Örneklem çekme

Verimiz çok büyük olduğu için %4'lük bir örneklem çekerek 400 gözlemlik bir veri seti elde ediyoruz.

# Load the sampled Superstore data set from Excel.
library(readxl)
ogveri <- read_excel("C:/Users/SİMAY/Documents/Veri Setleri/ogveri.xlsx")
# Open the data in the viewer for a quick visual check.
View(ogveri)

Kategorik değişkenleri tanımlıyoruz

# Define the categorical variables as factors with explicit level ordering.
# (The original chunk assigned Country and Region twice; the duplicates are
# removed here — re-running the identical factor() call was a no-op anyway.)
ogveri$`Ship Mode` <- factor(ogveri$`Ship Mode`, levels = c("Second Class", "Standard Class", "First Class", "Same Day"))
ogveri$Segment <- factor(ogveri$Segment, levels = c("Consumer", "Home Office", "Corporate"))
ogveri$Country <- factor(ogveri$Country, levels = c("United States"))
ogveri$Region <- factor(ogveri$Region, levels = c("West", "Central", "East", "South"))
ogveri$Category <- factor(ogveri$Category, levels = c("Technology", "Office Supplies", "Furniture"))
ogveri$`Sub-Category` <- as.factor(ogveri$`Sub-Category`)
ogveri$City <- as.factor(ogveri$City)
ogveri$State <- as.factor(ogveri$State)
# Coerce the numeric columns in the data frame itself (the original only made
# loose global copies); the globals are kept because later chunks reference
# Quantity/Discount/Profit/Sales directly.
ogveri$Quantity <- Quantity <- as.numeric(ogveri$Quantity)
ogveri$Discount <- Discount <- as.numeric(ogveri$Discount)
ogveri$Profit <- Profit <- as.numeric(ogveri$Profit)
ogveri$Sales <- Sales <- as.numeric(ogveri$Sales)
summary(ogveri)
##           Ship Mode          Segment             Country               City    
##  Second Class  : 68   Consumer   :207   United States:400   New York City: 35  
##  Standard Class:247   Home Office: 71                       Los Angeles  : 29  
##  First Class   : 63   Corporate  :122                       Chicago      : 20  
##  Same Day      : 22                                         San Francisco: 17  
##                                                             Seattle      : 17  
##                                                             Houston      : 16  
##                                                             (Other)      :266  
##         State         Region               Category        Sub-Category
##  California: 81   West   :120   Technology     : 75   Binders    : 62  
##  New York  : 45   Central: 97   Office Supplies:240   Paper      : 55  
##  Texas     : 42   East   :114   Furniture      : 85   Furnishings: 35  
##  Illinois  : 30   South  : 69                         Phones     : 35  
##  Ohio      : 20                                       Chairs     : 34  
##  Florida   : 18                                       Accessories: 33  
##  (Other)   :164                                       (Other)    :146  
##      Sales              Quantity        Discount          Profit         
##  Min.   :    1.504   Min.   : 1.00   Min.   :0.0000   Min.   :-3701.893  
##  1st Qu.:   14.778   1st Qu.: 2.00   1st Qu.:0.0000   1st Qu.:    1.544  
##  Median :   48.388   Median : 3.00   Median :0.2000   Median :    7.370  
##  Mean   :  264.304   Mean   : 3.85   Mean   :0.1678   Mean   :   38.284  
##  3rd Qu.:  197.519   3rd Qu.: 5.00   3rd Qu.:0.2000   3rd Qu.:   27.750  
##  Max.   :13999.960   Max.   :14.00   Max.   :0.8000   Max.   : 6719.981  
## 

çektiğimiz örneklemi bilgisayara aktarıyoruz.

# Save the drawn sample back to disk as an Excel file.
library("openxlsx")
## Warning: package 'openxlsx' was built under R version 4.1.3
write.xlsx(ogveri, 'ogveri.xlsx')

Eksik Veriler

# Row-wise NA counts: every row returns 0, so no observation has missing data.
rowSums(is.na(ogveri))
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [371] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Column-wise NA counts: every variable is complete.
colSums(is.na(ogveri))
##    Ship Mode      Segment      Country         City        State       Region 
##            0            0            0            0            0            0 
##     Category Sub-Category        Sales     Quantity     Discount       Profit 
##            0            0            0            0            0            0

Veride eksik gözlem olmadığı için eksik değerleri kendimiz yaratıyoruz

# Work on a copy so the original sample stays intact, then blank out a
# random 5% of the Quantity values to create artificial missingness.
data_miss <- ogveri
miss_idx <- sample(seq_len(nrow(data_miss)), floor(nrow(data_miss) * 0.05))
data_miss$Quantity[miss_idx] <- NA
# Verify: only Quantity should now contain NAs.
colSums(is.na(data_miss))
##    Ship Mode      Segment      Country         City        State       Region 
##            0            0            0            0            0            0 
##     Category Sub-Category        Sales     Quantity     Discount       Profit 
##            0            0            0           20            0            0
# Inspect the data with the injected missing values.
View(data_miss)

Mice paketini kullanarak eksik gözlemlerimizin yapısını inceliyoruz.

library(mice)
## Warning: package 'mice' was built under R version 4.1.3
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
# Tabulate the missingness pattern (rotate.names makes long labels readable).
md.pattern(data_miss,rotate.names = TRUE)

##     Ship Mode Segment Country City State Region Category Sub-Category Sales
## 380         1       1       1    1     1      1        1            1     1
## 20          1       1       1    1     1      1        1            1     1
##             0       0       0    0     0      0        0            0     0
##     Discount Profit Quantity   
## 380        1      1        1  0
## 20         1      1        0  1
##            0      0       20 20

Eksik gözlemlerin “Quantity” değişkeninde 20 adet olduğunu gözlemliyoruz. 380 tane de dolu veri bulunmakta.

Şimdi de aggr fonksiyonu ile eksik gözlemlerin yapısını inceleyelim. Bunun için öncelikle VIM ve ISLR kütüphanelerini import ediyoruz.

library(VIM)
## Warning: package 'VIM' was built under R version 4.1.3
## Zorunlu paket yükleniyor: colorspace
## Zorunlu paket yükleniyor: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.3
# Visualize missingness: proportion per variable (left panel) and the joint
# missingness pattern (right panel); navy = observed, pink = missing.
aggr(data_miss,col=c("navyblue","pink"),numbers=TRUE, sortVars=TRUE, labels=names(data_miss),cex.axis=.7,gap=3,ylab=c("Missing Ratio","Missing Pattern"))

## 
##  Variables sorted by number of missings: 
##      Variable Count
##      Quantity  0.05
##     Ship Mode  0.00
##       Segment  0.00
##       Country  0.00
##          City  0.00
##         State  0.00
##        Region  0.00
##      Category  0.00
##  Sub-Category  0.00
##         Sales  0.00
##      Discount  0.00
##        Profit  0.00

Eksik gözlemlerin veri setimizin %5'lik (0,05 oranında) bir kısmını kapladığını görüyoruz. Eksik gözlemlerin yalnızca “Quantity” değişkeninde bulunduğunu da görebiliyoruz.

Karar ağacı yöntemiyle eksik gözlemlerimizi doldurmayı tercih ediyoruz. Karar ağacı öğrenmesi (decision tree learning) yöntemi, makine öğrenmesi (machine learning) konularından birisidir. Literatürde karar ağacı öğrenmesinin alt yöntemleri olarak kabul edilebilecek sınıflandırma ağacı (classification tree) veya regresyon ağacı (regression tree, tahmin ağacı) gibi uygulamaları vardır.

Karar ağacı öğrenmesinde, bir ağaç yapısı oluşturularak ağacın yaprakları seviyesinde sınıf etiketleri ve bu yapraklara giden ve başlangıçtan çıkan kollar ile de özellikler üzerindeki işlemeler ifade edilmektedir.

library(rpart)
# Fit a regression tree (method = "anova") predicting Quantity from the other
# numeric variables; it will be used to impute the missing Quantity values.
data_dt<-data_miss
rtree <- rpart(Quantity ~ Discount + Profit+ Sales, data_dt, method="anova")
library(rattle)
## Warning: package 'rattle' was built under R version 4.1.3
## Zorunlu paket yükleniyor: tibble
## Warning: package 'tibble' was built under R version 4.1.3
## Zorunlu paket yükleniyor: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## 
## Attaching package: 'rattle'
## The following object is masked from 'package:VIM':
## 
##     wine
# Draw the fitted tree (cex shrinks the labels so the plot stays readable).
fancyRpartPlot(rtree,cex=0.5)

# Replace each missing Quantity with the tree's prediction for that row.
# NOTE(review): anova-tree predictions are node means, so imputed values can
# be non-integer even though Quantity is a count — consider rounding.
data_dt$Quantity <- ifelse(is.na(data_dt$Quantity), predict(rtree,data_dt,type="vector"),data_dt$Quantity)
library(mice)
# Re-check the missingness pattern: the data should now be complete.
md.pattern(data_dt,rotate.names = TRUE)
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##     Ship Mode Segment Country City State Region Category Sub-Category Sales
## 400         1       1       1    1     1      1        1            1     1
##             0       0       0    0     0      0        0            0     0
##     Quantity Discount Profit  
## 400        1        1      1 0
##            0        0      0 0

Kullanılmayan değişkenlerin silinmesi

eksik değerleri tamamlanmış data_dt’yi bilgisayarımıza kaydediyoruz

# Save the imputed data set to disk.
library("openxlsx")
write.xlsx(data_dt, "data_imputed.xlsx")

Veri setini eğitim ve test veri kümesi olarak bölmek

Rastgeleliği sabitlemek için seed fonksiyonunu kullanıyoruz. Veri setimizin %80’i ile bir eğitim verisi oluşturuyoruz.

# Fix the RNG seed so the split is reproducible, then draw 80% of the row
# indices (without replacement) for the training set; the rest form the test set.
set.seed(52685136)
idx_train <- sample(seq_len(nrow(ogveri)), size = round(0.8 * nrow(ogveri)), replace = FALSE)
tra <- ogveri[idx_train, ]
tst <- ogveri[-idx_train, ]
# Export both partitions as Excel files.
library("openxlsx")
write.xlsx(tra, "train.xlsx")
write.xlsx(tst, "test.xlsx")

Eğitim verimizi data frame formatına çeviriyoruz. Analizlerimizde bundan sonrası için eğitim verimizi kullanacağız.

# Convert the training tibble to a plain data frame; all analyses from here
# on use the training set only.
tra<-as.data.frame(tra)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Quick structural overview of the training data (320 rows, 12 columns).
glimpse(tra)
## Rows: 320
## Columns: 12
## $ `Ship Mode`    <fct> Standard Class, Standard Class, Standard Class, Standar~
## $ Segment        <fct> Consumer, Consumer, Consumer, Corporate, Consumer, Corp~
## $ Country        <fct> United States, United States, United States, United Sta~
## $ City           <fct> Belleville, San Francisco, Columbus, Newark, Huntsville~
## $ State          <fct> New Jersey, California, Georgia, Delaware, Texas, Texas~
## $ Region         <fct> East, West, South, East, Central, Central, Central, Cen~
## $ Category       <fct> Technology, Furniture, Office Supplies, Furniture, Furn~
## $ `Sub-Category` <fct> Accessories, Furnishings, Binders, Chairs, Tables, Stor~
## $ Sales          <dbl> 239.970, 16.740, 36.400, 291.100, 211.372, 37.224, 7.99~
## $ Quantity       <dbl> 3, 2, 5, 5, 2, 3, 1, 7, 4, 5, 3, 4, 3, 5, 3, 7, 2, 3, 6~
## $ Discount       <dbl> 0.0, 0.0, 0.0, 0.0, 0.3, 0.2, 0.2, 0.8, 0.7, 0.0, 0.2, ~
## $ Profit         <dbl> 71.9910, 4.3524, 17.1080, 75.6860, -45.2940, 3.7224, 0.~
# Descriptive summary of the training data.
summary(tra)
##           Ship Mode          Segment             Country               City    
##  Second Class  : 52   Consumer   :161   United States:320   New York City: 30  
##  Standard Class:204   Home Office: 59                       Los Angeles  : 26  
##  First Class   : 46   Corporate  :100                       Chicago      : 17  
##  Same Day      : 18                                         San Francisco: 14  
##                                                             Houston      : 12  
##                                                             Philadelphia : 12  
##                                                             (Other)      :209  
##         State         Region              Category        Sub-Category
##  California: 63   West   :91   Technology     : 64   Binders    : 49  
##  New York  : 37   Central:80   Office Supplies:189   Paper      : 40  
##  Texas     : 34   East   :91   Furniture      : 67   Phones     : 30  
##  Illinois  : 25   South  :58                         Accessories: 29  
##  Florida   : 17                                      Art        : 28  
##  Ohio      : 15                                      Chairs     : 27  
##  (Other)   :129                                      (Other)    :117  
##      Sales             Quantity         Discount          Profit         
##  Min.   :   1.504   Min.   : 1.000   Min.   :0.0000   Min.   :-3701.893  
##  1st Qu.:  13.885   1st Qu.: 2.000   1st Qu.:0.0000   1st Qu.:    1.028  
##  Median :  50.180   Median : 3.000   Median :0.2000   Median :    6.815  
##  Mean   : 228.150   Mean   : 3.791   Mean   :0.1681   Mean   :   20.582  
##  3rd Qu.: 194.816   3rd Qu.: 5.000   3rd Qu.:0.2000   3rd Qu.:   26.892  
##  Max.   :5083.960   Max.   :14.000   Max.   :0.8000   Max.   : 1906.485  
## 

Sayısal verileri kategorize etmek

“Discount” değişkenimizi, değer 0 ise “indirim yok”, 0’ın üstünde ise “indirimli” olarak kategorize ediyoruz. Benzer şekilde “Profit” değişkenini de 0 ve üzeri için “kar”, negatif değerler için “zarar” olarak kategorize ediyoruz.

# Derive categorical versions of Discount and Profit: any positive discount
# is "indirimli", otherwise "indirim yok"; non-negative profit is "kar",
# otherwise "zarar". (Same thresholds as the original index-assignment form.)
tra$Indirim_kat <- as.factor(ifelse(tra$Discount > 0, "indirimli", "indirim yok"))
tra$Karlilik <- as.factor(ifelse(tra$Profit < 0, "zarar", "kar"))
summary(tra)
##           Ship Mode          Segment             Country               City    
##  Second Class  : 52   Consumer   :161   United States:320   New York City: 30  
##  Standard Class:204   Home Office: 59                       Los Angeles  : 26  
##  First Class   : 46   Corporate  :100                       Chicago      : 17  
##  Same Day      : 18                                         San Francisco: 14  
##                                                             Houston      : 12  
##                                                             Philadelphia : 12  
##                                                             (Other)      :209  
##         State         Region              Category        Sub-Category
##  California: 63   West   :91   Technology     : 64   Binders    : 49  
##  New York  : 37   Central:80   Office Supplies:189   Paper      : 40  
##  Texas     : 34   East   :91   Furniture      : 67   Phones     : 30  
##  Illinois  : 25   South  :58                         Accessories: 29  
##  Florida   : 17                                      Art        : 28  
##  Ohio      : 15                                      Chairs     : 27  
##  (Other)   :129                                      (Other)    :117  
##      Sales             Quantity         Discount          Profit         
##  Min.   :   1.504   Min.   : 1.000   Min.   :0.0000   Min.   :-3701.893  
##  1st Qu.:  13.885   1st Qu.: 2.000   1st Qu.:0.0000   1st Qu.:    1.028  
##  Median :  50.180   Median : 3.000   Median :0.2000   Median :    6.815  
##  Mean   : 228.150   Mean   : 3.791   Mean   :0.1681   Mean   :   20.582  
##  3rd Qu.: 194.816   3rd Qu.: 5.000   3rd Qu.:0.2000   3rd Qu.:   26.892  
##  Max.   :5083.960   Max.   :14.000   Max.   :0.8000   Max.   : 1906.485  
##                                                                          
##       Indirim_kat   Karlilik  
##  indirim yok:147   kar  :250  
##  indirimli  :173   zarar: 70  
##                               
##                               
##                               
##                               
## 
# Structure check after adding the two new factor columns (now 14 columns).
glimpse(tra)
## Rows: 320
## Columns: 14
## $ `Ship Mode`    <fct> Standard Class, Standard Class, Standard Class, Standar~
## $ Segment        <fct> Consumer, Consumer, Consumer, Corporate, Consumer, Corp~
## $ Country        <fct> United States, United States, United States, United Sta~
## $ City           <fct> Belleville, San Francisco, Columbus, Newark, Huntsville~
## $ State          <fct> New Jersey, California, Georgia, Delaware, Texas, Texas~
## $ Region         <fct> East, West, South, East, Central, Central, Central, Cen~
## $ Category       <fct> Technology, Furniture, Office Supplies, Furniture, Furn~
## $ `Sub-Category` <fct> Accessories, Furnishings, Binders, Chairs, Tables, Stor~
## $ Sales          <dbl> 239.970, 16.740, 36.400, 291.100, 211.372, 37.224, 7.99~
## $ Quantity       <dbl> 3, 2, 5, 5, 2, 3, 1, 7, 4, 5, 3, 4, 3, 5, 3, 7, 2, 3, 6~
## $ Discount       <dbl> 0.0, 0.0, 0.0, 0.0, 0.3, 0.2, 0.2, 0.8, 0.7, 0.0, 0.2, ~
## $ Profit         <dbl> 71.9910, 4.3524, 17.1080, 75.6860, -45.2940, 3.7224, 0.~
## $ Indirim_kat    <fct> indirim yok, indirim yok, indirim yok, indirim yok, ind~
## $ Karlilik       <fct> kar, kar, kar, kar, zarar, kar, kar, zarar, zarar, kar,~

Aykırı/uç değerlerin tespiti

Aykırı/uç değerleri incelemek için “Quantity” değişkeni üzerinde box-plot grafiğini deniyoruz.

library(ggplot2)
# Box plot of Quantity.
# NOTE(review): this plots the global `Quantity` vector built from the FULL
# 400-row sample (ogveri), not the training set `tra` — confirm intent.
ggplot() +
  aes(x = "", y = Quantity) +
  geom_boxplot(fill = "#0c4c8a") +
  theme_minimal()

Fakat box-plot grafiği bize istediğimiz kadar bilgi ve detay sağlayamamakta. Bu nedenle istatistiksel yöntem olan “Hampel filter” yardımı ile potansiyel aykırı değerleri bulmayı deneyeceğiz.

Frank Rudolf Hampel tarafından literatüre kazandırılan ve popülerleştirilen ortanca mutlak sapma değeri, veri setindeki gözlemlerin ortanca değerden ne kadar uzakta olduğunun ölçüsüdür.

aykırı değerleri belirlemek için alt ve üst eşik değerleri şöyle hesaplanır:

Alt eşik = Ortanca – 3 * (MAD), Üst eşik = Ortanca + 3 * (MAD)

Genel bir ifade olarak, ortanca değerden 3 ortanca mutlak sapma uzaklıkta olan gözlemleri aykırı değer olarak konumlayabilirsiniz.

# Hampel filter for Profit: median +/- 3 * raw MAD (constant = 1 disables the
# 1.4826 consistency scaling, matching the formula given in the text).
# NOTE(review): Profit here is the global vector from the FULL 400-row sample
# (ogveri), not the training set — confirm intent.
lower_bound_profit <- median(Profit) - 3 * mad(Profit, constant = 1)
lower_bound_profit
## [1] -24.20725
upper_bound_profit <- median(Profit) + 3 * mad(Profit, constant = 1)
upper_bound_profit
## [1] 38.94725
# Indices of observations flagged as potential outliers.
outlier_ind_profit <- which(Profit < lower_bound_profit | Profit > upper_bound_profit)
outlier_ind_profit
##   [1]   1   2   8   9  12  29  31  34  35  36  38  42  43  44  45  50  52  59
##  [19]  67  72  76  91  92  93  95  96  97  99 103 105 106 109 110 123 134 135
##  [37] 136 137 138 141 146 151 155 156 159 166 174 180 181 191 193 195 198 207
##  [55] 210 211 216 222 225 227 229 233 235 238 241 243 247 252 253 254 263 266
##  [73] 268 275 280 289 292 294 295 296 299 306 311 313 317 319 320 321 328 329
##  [91] 332 337 339 343 345 352 354 361 362 366 371 372 373 377 378 381 384 385
## [109] 387 391 392 394 395 398 399
# Hampel filter for Discount: no observation falls outside the thresholds.
lower_bound_discount <- median(Discount) - 3 * mad(Discount, constant = 1)
lower_bound_discount
## [1] -0.4
upper_bound_discount <- median(Discount) + 3 * mad(Discount, constant = 1)
upper_bound_discount
## [1] 0.8
outlier_ind_discount <- which(Discount < lower_bound_discount | Discount > upper_bound_discount)
outlier_ind_discount
## integer(0)
# Hampel filter for Sales.
lower_bound_sales <- median(Sales) - 3 * mad(Sales, constant = 1)
lower_bound_sales
## [1] -72.836
upper_bound_sales <- median(Sales) + 3 * mad(Sales, constant = 1)
upper_bound_sales
## [1] 169.612
# Indices of observations flagged as potential outliers.
outlier_ind_sales <- which(Sales < lower_bound_sales | Sales > upper_bound_sales)
outlier_ind_sales
##   [1]   1   2   8   9  31  34  35  36  38  43  50  52  53  59  65  67  72  75
##  [19]  76  77  89  93  95  96  97 105 107 109 110 121 123 124 127 134 135 136
##  [37] 137 138 141 146 155 156 179 180 181 195 198 200 201 203 210 211 217 222
##  [55] 227 229 235 238 241 243 247 251 252 254 261 266 268 273 275 280 289 290
##  [73] 294 295 296 299 302 306 311 313 314 317 320 321 325 328 329 332 335 337
##  [91] 349 351 352 354 357 361 364 366 371 377 378 381 384 385 387 391 392 394
## [109] 395 398 399
# Hampel filter for Quantity (upper bound computed first in the original).
upper_bound_quantity <- median(Quantity) + 3 * mad(Quantity, constant = 1)
upper_bound_quantity
## [1] 6
lower_bound_quantity <- median(Quantity) - 3 * mad(Quantity, constant = 1)
lower_bound_quantity
## [1] 0
# Indices of observations flagged as potential outliers.
outlier_ind_quantity <- which(Quantity < lower_bound_quantity | Quantity > upper_bound_quantity)
outlier_ind_quantity
##  [1]   3  11  26  31  33  34  38  47  48  50  67  75  85  91 109 117 120 122 123
## [20] 135 136 138 148 149 155 166 167 173 176 209 224 246 247 254 255 267 268 270
## [39] 275 280 289 312 328 333 337 347 352 358 361 366 371 378 384 391 392 399

Uç değerler bilgi verici olduğu için eleme yapmamayı seçiyoruz.

Verilerin açıklayıcı/Keşfedici çözümlemesi

Histogram grafiği

# Histogram of Sales on the training set.
hist(tra$Sales, col = "darkgreen")

Sales değişkenimizin dağılımının bu grafiğe baktığımız zaman normal dağılıma çok uzak olduğunu gözlemliyoruz. Yüksek derecede sağa çarpıklık bulunmaktadır. İlerleyen adımlarda dönüşüm gerekmektedir.

# Histogram of Profit on the training set.
hist(tra$Profit, col = "green")

Profit değişkenimizin dağılımının bu grafiğe baktığımız zaman normal dağılıma çok uzak olduğunu gözlemliyoruz. Yüksek derecede sola çarpıklık bulunmaktadır. İlerleyen adımlarda dönüşüm gerekmektedir.

# Histogram of Discount on the training set.
hist(tra$Discount, col = "yellow")

Discount değişkenimizin dağılımının bu grafiğe baktığımız zaman normal dağılıma çok uzak olduğunu gözlemliyoruz. Yüksek derecede sağa çarpıklık bulunmaktadır. İlerleyen adımlarda dönüşüm gerekmektedir.

# Histogram of Quantity on the training set.
hist(tra$Quantity, col = "purple")

Quantity değişkenimizin dağılımının bu grafiğe baktığımız zaman normal dağılıma çok uzak olduğunu gözlemliyoruz. Yüksek derecede sağa çarpıklık bulunmaktadır. İlerleyen adımlarda dönüşüm gerekmektedir.

# Lower-case numeric copies of the training columns; the box plots below
# reference these globals.
quantity <- as.numeric(tra$Quantity)
discount <- as.numeric(tra$Discount)
profit <- as.numeric(tra$Profit)
sales <- as.numeric(tra$Sales)

Kutu grafiği

library(ggplot2)
# Box plot of Discount: identical plot, with the aesthetics passed through
# ggplot()'s mapping argument instead of an added aes() layer.
ggplot(mapping = aes(x = "", y = discount)) +
  geom_boxplot(fill = "#0c4c8a") +
  theme_minimal()

Burda da Discount değişkeninin aykırı değerlere sahip olduğunu ve sağa çarpık olduğunu bir kez daha kutu grafiği sayesinde gözlemliyoruz.

library(ggplot2)
# Box plot of Quantity (training-set copy).
ggplot() +
  aes(x = "", y = quantity) +
  geom_boxplot(fill = "#0c4c8a") +
  theme_minimal()

Burda da Quantity değişkeninin aykırı değerlere sahip olduğunu ve sağa çarpık olduğunu bir kez daha kutu grafiği sayesinde gözlemliyoruz.

library(ggplot2)
# Box plot of Profit (training-set copy).
ggplot() +
  aes(x = "", y = profit) +
  geom_boxplot(fill = "#0c4c8a") +
  theme_minimal()

Bu grafikte boxplot, verinin dağılımından dolayı çok açıklayıcı bir görüntü sağlamamaktadır. Fakat yine de aykırı değerlerin olduğunu ve verinin dağılımının sola çarpık olduğunu gözlemleyebiliyoruz.

library(ggplot2)
# Box plot of Sales (training-set copy).
ggplot() +
  aes(x = "", y = sales) +
  geom_boxplot(fill = "#0c4c8a") +
  theme_minimal()

Bu grafikte Sales değişkenlerinin birbirinden fazlasıyla değişkenlik gösterdiğini , verinin oldukça sağa çarpık olduğunu ve uç değerlerin çok fazla olduğunu gözlemleyebiliyoruz.

Q-Q çizimi

# Q-Q plot of Profit, first with ggplot2 then with base graphics.
ggplot(tra, aes(sample=Profit))+stat_qq()

qqnorm(tra$Profit)

Profit değişkenimizin Q-Q grafiğinin iç bükey (konkav) olduğunu, bu nedenle sağa çarpık olduğunu diğer grafiklerdeki gibi gözlemleyebiliyoruz. Aynı zamanda aykırı değerlerin varlığı bu grafikte de aşikardır.

# Q-Q plots of Discount and Sales (ggplot2 and base versions of each).
ggplot(tra, aes(sample=Discount))+stat_qq()

qqnorm(tra$Discount)

ggplot(tra, aes(sample=Sales))+stat_qq()

qqnorm(tra$Sales)

Dış bükey (konveks) olduğundan dağılımın sola çarpık olduğunu söyleyebiliriz. Aynı zamanda uç değerlerin varlığı da aşikardır.

# Q-Q plot of Quantity; the step pattern reflects its discreteness.
ggplot(tra, aes(sample=Quantity))+stat_qq()

qqnorm(tra$Quantity)

Buradan quantity değişkeninin kesikli değişken olduğunu görüyoruz.

Matris formlu saçılım grafikleri

# Select the numeric columns by NAME (columns 9-12 are Sales, Quantity,
# Discount, Profit) instead of by position, so the code survives any
# reordering or addition of columns.
cor_tra <- tra[, c("Sales", "Quantity", "Discount", "Profit")]
library(GGally)
## Warning: package 'GGally' was built under R version 4.1.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pearson correlation matrix of the numeric variables.
cor(cor_tra)
##                Sales    Quantity    Discount     Profit
## Sales     1.00000000  0.33806435 -0.01139695  0.2699386
## Quantity  0.33806435  1.00000000 -0.06993926  0.0575779
## Discount -0.01139695 -0.06993926  1.00000000 -0.2540963
## Profit    0.26993861  0.05757790 -0.25409629  1.0000000
# Base scatter-plot matrix, then the richer GGally version.
plot(cor_tra)

ggpairs(cor_tra)

# Pair plots of the main categorical variables (columns 1-3 and 6 are
# Ship Mode, Segment, Country and Region), colored by discount status and
# then by region; columns selected by name instead of magic indices.
p <- GGally::ggpairs(tra[, c("Ship Mode", "Segment", "Country", "Region")], aes(color = tra$Indirim_kat))
p

p <- GGally::ggpairs(tra[, c("Ship Mode", "Segment", "Country", "Region")], aes(color = tra$Region))
p

Değişken türlerine göre incelemeler

library(funModeling)
## Warning: package 'funModeling' was built under R version 4.1.3
## Zorunlu paket yükleniyor: Hmisc
## Warning: package 'Hmisc' was built under R version 4.1.3
## Zorunlu paket yükleniyor: lattice
## Warning: package 'lattice' was built under R version 4.1.3
## Zorunlu paket yükleniyor: survival
## Zorunlu paket yükleniyor: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## funModeling v.1.9.4 :)
## Examples and tutorials at livebook.datascienceheroes.com
##  / Now in Spanish: librovivodecienciadedatos.ai
## 
## Attaching package: 'funModeling'
## The following object is masked from 'package:GGally':
## 
##     range01
# Numeric profiling: quantiles, skewness, kurtosis, IQR per variable.
profiling_num(tra)  # numeric variables only
##   variable       mean     std_dev variation_coef       p_01      p_05    p_25
## 1    Sales 228.149556 517.1744894      2.2668222    2.43024   4.56030 13.8850
## 2 Quantity   3.790625   2.2887019      0.6037796    1.00000   1.00000  2.0000
## 3 Discount   0.168125   0.2161205      1.2854751    0.00000   0.00000  0.0000
## 4   Profit  20.582392 253.5346654     12.3180372 -261.01925 -54.71581  1.0285
##      p_50     p_75      p_95      p_99  skewness   kurtosis      iqr
## 1 50.1800 194.8165 1160.6060 2512.9046  4.855860  34.416659 180.9315
## 2  3.0000   5.0000    8.0000   12.4300  1.444466   5.671234   3.0000
## 3  0.2000   0.2000    0.7000    0.8000  1.587077   4.932676   0.2000
## 4  6.8154  26.8918  135.1641  614.1018 -8.431360 156.294705  25.8633
##                   range_98             range_80
## 1    [2.43024, 2512.90456]     [6.7384, 541.41]
## 2               [1, 12.43]               [2, 7]
## 3                 [0, 0.8]             [0, 0.5]
## 4 [-261.019248, 614.10175] [-20.1558, 87.75887]
# Histograms of all numeric variables.
plot_num(tra)  # numeric variables
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

# Frequency tables and bar charts for the categorical variables.
freq(tra)  # categorical variables
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
##        Ship.Mode frequency percentage cumulative_perc
## 1 Standard Class       204      63.75           63.75
## 2   Second Class        52      16.25           80.00
## 3    First Class        46      14.38           94.38
## 4       Same Day        18       5.62          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##       Segment frequency percentage cumulative_perc
## 1    Consumer       161      50.31           50.31
## 2   Corporate       100      31.25           81.56
## 3 Home Office        59      18.44          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##         Country frequency percentage cumulative_perc
## 1 United States       320        100             100
## Warning in freq_logic(data = data, input = input[i], plot, na.rm, path_out =
## path_out): Skipping plot for variable 'City' (more than 100 categories)
##                 City frequency percentage cumulative_perc
## 1      New York City        30       9.38            9.38
## 2        Los Angeles        26       8.12           17.50
## 3            Chicago        17       5.31           22.81
## 4      San Francisco        14       4.38           27.19
## 5            Houston        12       3.75           30.94
## 6       Philadelphia        12       3.75           34.69
## 7            Seattle        12       3.75           38.44
## 8           Columbus         7       2.19           40.63
## 9          San Diego         7       2.19           42.82
## 10            Dallas         6       1.88           44.70
## 11          Columbia         5       1.56           46.26
## 12      Jacksonville         5       1.56           47.82
## 13           Detroit         4       1.25           49.07
## 14            Newark         4       1.25           50.32
## 15       Springfield         4       1.25           51.57
## 16           Atlanta         3       0.94           52.51
## 17            Aurora         3       0.94           53.45
## 18            Austin         3       0.94           54.39
## 19           Concord         3       0.94           55.33
## 20        Huntsville         3       0.94           56.27
## 21        Long Beach         3       0.94           57.21
## 22        Plantation         3       0.94           58.15
## 23            Quincy         3       0.94           59.09
## 24       San Antonio         3       0.94           60.03
## 25         Arlington         2       0.62           60.65
## 26         Brentwood         2       0.62           61.27
## 27           Buffalo         2       0.62           61.89
## 28           Clinton         2       0.62           62.51
## 29         Fairfield         2       0.62           63.13
## 30   Fort Lauderdale         2       0.62           63.75
## 31          Glendale         2       0.62           64.37
## 32           Hialeah         2       0.62           64.99
## 33           Jackson         2       0.62           65.61
## 34        Louisville         2       0.62           66.23
## 35            Marion         2       0.62           66.85
## 36         Milwaukee         2       0.62           67.47
## 37         Nashville         2       0.62           68.09
## 38          Pasadena         2       0.62           68.71
## 39    Pembroke Pines         2       0.62           69.33
## 40          Richmond         2       0.62           69.95
## 41     Saint Charles         2       0.62           70.57
## 42            Skokie         2       0.62           71.19
## 43       Tallahassee         2       0.62           71.81
## 44            Toledo         2       0.62           72.43
## 45            Tucson         2       0.62           73.05
## 46        Washington         2       0.62           73.67
## 47          Appleton         1       0.31           73.98
## 48          Avondale         1       0.31           74.29
## 49       Bakersfield         1       0.31           74.60
## 50         Baltimore         1       0.31           74.91
## 51            Bangor         1       0.31           75.22
## 52           Baytown         1       0.31           75.53
## 53        Belleville         1       0.31           75.84
## 54     Boynton Beach         1       0.31           76.15
## 55           Bristol         1       0.31           76.46
## 56          Carlsbad         1       0.31           76.77
## 57      Carol Stream         1       0.31           77.08
## 58        Carrollton         1       0.31           77.39
## 59              Cary         1       0.31           77.70
## 60         Charlotte         1       0.31           78.01
## 61  Colorado Springs         1       0.31           78.32
## 62    Cuyahoga Falls         1       0.31           78.63
## 63          Dearborn         1       0.31           78.94
## 64            Denver         1       0.31           79.25
## 65       Des Plaines         1       0.31           79.56
## 66            Dublin         1       0.31           79.87
## 67         Encinitas         1       0.31           80.18
## 68           Everett         1       0.31           80.49
## 69          Florence         1       0.31           80.80
## 70        Fort Worth         1       0.31           81.11
## 71          Freeport         1       0.31           81.42
## 72        Georgetown         1       0.31           81.73
## 73     Grand Prairie         1       0.31           82.04
## 74       Great Falls         1       0.31           82.35
## 75        Greensboro         1       0.31           82.66
## 76       Haltom City         1       0.31           82.97
## 77         Henderson         1       0.31           83.28
## 78    Hendersonville         1       0.31           83.59
## 79           Hickory         1       0.31           83.90
## 80  Huntington Beach         1       0.31           84.21
## 81      Indianapolis         1       0.31           84.52
## 82      Johnson City         1       0.31           84.83
## 83              Kent         1       0.31           85.14
## 84         Knoxville         1       0.31           85.45
## 85         Lakeville         1       0.31           85.76
## 86          Lakewood         1       0.31           86.07
## 87          Lawrence         1       0.31           86.38
## 88            Lawton         1       0.31           86.69
## 89       Little Rock         1       0.31           87.00
## 90             Mason         1       0.31           87.31
## 91           Mcallen         1       0.31           87.62
## 92           Meriden         1       0.31           87.93
## 93          Mesquite         1       0.31           88.24
## 94     Missouri City         1       0.31           88.55
## 95            Monroe         1       0.31           88.86
## 96     Moreno Valley         1       0.31           89.17
## 97      Newport News         1       0.31           89.48
## 98            Orange         1       0.31           89.79
## 99             Pasco         1       0.31           90.10
## 100          Passaic         1       0.31           90.41
## 101          Phoenix         1       0.31           90.72
## 102       Pine Bluff         1       0.31           91.03
## 103       Plainfield         1       0.31           91.34
## 104    Pompano Beach         1       0.31           91.65
## 105       Providence         1       0.31           91.96
## 106 Rancho Cucamonga         1       0.31           92.27
## 107           Revere         1       0.31           92.58
## 108        Rochester         1       0.31           92.89
## 109         Rockford         1       0.31           93.20
## 110        Rockville         1       0.31           93.51
## 111        Roseville         1       0.31           93.82
## 112       Sacramento         1       0.31           94.13
## 113 Saint Petersburg         1       0.31           94.44
## 114            Salem         1       0.31           94.75
## 115   San Bernardino         1       0.31           95.06
## 116    Sandy Springs         1       0.31           95.37
## 117    Santa Barbara         1       0.31           95.68
## 118        Sheboygan         1       0.31           95.99
## 119           Smyrna         1       0.31           96.30
## 120        Sunnyvale         1       0.31           96.61
## 121         Thornton         1       0.31           96.92
## 122           Tigard         1       0.31           97.23
## 123             Troy         1       0.31           97.54
## 124       Twin Falls         1       0.31           97.85
## 125            Utica         1       0.31           98.16
## 126  West Palm Beach         1       0.31           98.47
## 127         Westland         1       0.31           98.78
## 128         Wheeling         1       0.31           99.09
## 129       Wilmington         1       0.31           99.40
## 130          Yonkers         1       0.31          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##                   State frequency percentage cumulative_perc
## 1            California        63      19.69           19.69
## 2              New York        37      11.56           31.25
## 3                 Texas        34      10.62           41.87
## 4              Illinois        25       7.81           49.68
## 5               Florida        17       5.31           54.99
## 6                  Ohio        15       4.69           59.68
## 7            Washington        13       4.06           63.74
## 8          Pennsylvania        12       3.75           67.49
## 9               Georgia         8       2.50           69.99
## 10             Michigan         8       2.50           72.49
## 11       North Carolina         8       2.50           74.99
## 12             Virginia         7       2.19           77.18
## 13              Arizona         6       1.88           79.06
## 14             Colorado         5       1.56           80.62
## 15             Kentucky         5       1.56           82.18
## 16             Maryland         5       1.56           83.74
## 17           New Jersey         5       1.56           85.30
## 18            Tennessee         5       1.56           86.86
## 19             Delaware         4       1.25           88.11
## 20        Massachusetts         4       1.25           89.36
## 21            Wisconsin         4       1.25           90.61
## 22          Connecticut         3       0.94           91.55
## 23              Indiana         3       0.94           92.49
## 24             Missouri         3       0.94           93.43
## 25       South Carolina         3       0.94           94.37
## 26              Alabama         2       0.62           94.99
## 27             Arkansas         2       0.62           95.61
## 28 District of Columbia         2       0.62           96.23
## 29            Minnesota         2       0.62           96.85
## 30                Idaho         1       0.31           97.16
## 31            Louisiana         1       0.31           97.47
## 32                Maine         1       0.31           97.78
## 33              Montana         1       0.31           98.09
## 34        New Hampshire         1       0.31           98.40
## 35           New Mexico         1       0.31           98.71
## 36             Oklahoma         1       0.31           99.02
## 37               Oregon         1       0.31           99.33
## 38         Rhode Island         1       0.31           99.64
## 39        West Virginia         1       0.31          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##    Region frequency percentage cumulative_perc
## 1    West        91      28.44           28.44
## 2    East        91      28.44           56.88
## 3 Central        80      25.00           81.88
## 4   South        58      18.12          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##          Category frequency percentage cumulative_perc
## 1 Office Supplies       189      59.06           59.06
## 2       Furniture        67      20.94           80.00
## 3      Technology        64      20.00          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##    Sub.Category frequency percentage cumulative_perc
## 1       Binders        49      15.31           15.31
## 2         Paper        40      12.50           27.81
## 3        Phones        30       9.38           37.19
## 4   Accessories        29       9.06           46.25
## 5           Art        28       8.75           55.00
## 6        Chairs        27       8.44           63.44
## 7   Furnishings        26       8.12           71.56
## 8       Storage        26       8.12           79.68
## 9        Labels        16       5.00           84.68
## 10   Appliances        13       4.06           88.74
## 11       Tables        11       3.44           92.18
## 12    Fasteners         7       2.19           94.37
## 13    Envelopes         5       1.56           95.93
## 14     Machines         5       1.56           97.49
## 15     Supplies         5       1.56           99.05
## 16    Bookcases         3       0.94          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Indirim_kat frequency percentage cumulative_perc
## 1   indirimli       173      54.06           54.06
## 2 indirim yok       147      45.94          100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

##   Karlilik frequency percentage cumulative_perc
## 1      kar       250      78.12           78.12
## 2    zarar        70      21.88          100.00
## [1] "Variables processed: Ship.Mode, Segment, Country, City, State, Region, Category, Sub.Category, Indirim_kat, Karlilik"

Değişkenlerin hiçbiri normal dağılmamıştır. İlerleyen bölümlerde dönüşüm uygulamamız gerekmektedir.

Ship Mode değişkeninde en çok kullanılan yöntem Standard Class olurken en az kullanılan yöntem Same Day yöntemidir.

Segment içerisinde de en çok alanı consumer segmenti kapsamaktadır.

Veri setindeki tek ülke United States'tir.

En çok California eyaletinde işlem yapılmıştır.

Region değişkeninde ise west ve east eşittir.

Category değişkeninde en çok Office Supplies işlem görmüştür.

Sub-Category içinde ise en çok Binders işlem görmüştür.

Genel olarak indirim yapıldığını görüyoruz.

Zarardan çok kar edildiğini gözlemliyoruz.

Kategorik değişkenin düzeyleri bazında nicel değişkenlerin özet istatistikleri

# Summary statistics of Profit split by the discount indicator,
# via psych::describeBy.
library(psych)
## Warning: package 'psych' was built under R version 4.1.3
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(dplyr)
# Keep only the two columns needed for the grouped summary.
df  <- dplyr:: select(tra, Profit, Indirim_kat)
# One descriptive table per level of Indirim_kat.
# NOTE(review): the Indirim_kat* rows in the output are the (constant)
# grouping factor itself and carry no information.
describeBy(df, (df$Indirim_kat))
## 
##  Descriptive statistics by group 
## group: indirim yok
##              vars   n  mean     sd median trimmed   mad  min    max  range skew
## Profit          1 147 53.36 119.48  13.79   24.85 15.75 0.41 648.56 648.16 3.81
## Indirim_kat*    2 147  1.00   0.00   1.00    1.00  0.00 1.00   1.00   0.00  NaN
##              kurtosis   se
## Profit          14.35 9.85
## Indirim_kat*      NaN 0.00
## ------------------------------------------------------------ 
## group: indirimli
##              vars   n  mean     sd median trimmed   mad      min     max
## Profit          1 173 -7.27 324.66   2.64    2.77 15.83 -3701.89 1906.48
## Indirim_kat*    2 173  2.00   0.00   2.00    2.00  0.00     2.00    2.00
##                range  skew kurtosis    se
## Profit       5608.38 -7.33   100.95 24.68
## Indirim_kat*    0.00   NaN      NaN  0.00

Çok değişkenli kutu grafikleri

# Boxplots of Profit by each categorical variable; the stat_summary layer
# draws a line connecting the group medians.
# NOTE(review): `size` for line-width is the pre-3.4 ggplot2 spelling
# (later versions prefer `linewidth`) — consistent with the ggplot2
# version used elsewhere in this document.
library(ggplot2)
ggplot(tra, aes(x=Category,y=Profit, fill=Category))+
  geom_boxplot()+
  stat_summary(fun = median, geom="line", group= 1, color= "black", size = 1) 

# Profit by Sub-Category.
library(ggplot2)
ggplot(tra, aes(x=`Sub-Category`,y=Profit, fill=`Sub-Category`))+
  geom_boxplot()+
  stat_summary(fun = median, geom="line", group= 1, color= "black", size = 1) 

# Profit by Region.
library(ggplot2)
ggplot(tra, aes(x=Region,y=Profit, fill=Region))+
  geom_boxplot()+
  stat_summary(fun = median, geom="line", group= 1, color= "black", size = 1) 

# Profit by Segment.
library(ggplot2)
ggplot(tra, aes(x=Segment,y=Profit, fill=Segment))+
  geom_boxplot()+
  stat_summary(fun = median, geom="line", group= 1, color= "black", size = 1) 

# Profit by Ship Mode.
library(ggplot2)
ggplot(tra, aes(x=`Ship Mode`,y=Profit, fill=`Ship Mode`))+
  geom_boxplot()+
  stat_summary(fun = median, geom="line", group= 1, color= "black", size = 1) 

Kutu grafikleri sağlıklı sonuçlar vermedi. Verilerin kendi içindeki veya birbirleriyle olan ilişkisindeki çarpıklık gibi bir problemden dolayı olabilir.

Chernoff Yüzleri

library(aplpack)
library(dplyr)

# Region-level means of Profit, Sales and Discount — the three inputs
# mapped onto facial features by the Chernoff-faces plot below.
new_data<-tra%>%
  group_by(Region) %>%
  dplyr::summarize(mean_profit = mean(Profit),mean_saless = mean(Sales),mean_discountt = mean(Discount))

# Drop the Region column (column 1) and use it as the face labels instead.
faces(new_data[,-1],  labels=as.character(new_data$Region))

## effect of variables:
##  modified item       Var             
##  "height of face   " "mean_profit"   
##  "width of face    " "mean_saless"   
##  "structure of face" "mean_discountt"
##  "height of mouth  " "mean_profit"   
##  "width of mouth   " "mean_saless"   
##  "smiling          " "mean_discountt"
##  "height of eyes   " "mean_profit"   
##  "width of eyes    " "mean_saless"   
##  "height of hair   " "mean_discountt"
##  "width of hair   "  "mean_profit"   
##  "style of hair   "  "mean_saless"   
##  "height of nose  "  "mean_discountt"
##  "width of nose   "  "mean_profit"   
##  "width of ear    "  "mean_saless"   
##  "height of ear   "  "mean_discountt"

En çok kar Batıda gerçekleşmiştir.

Yıldız Grafikler

# Sort observations by descending Profit before slicing into deciles.
data_sorted <- tra[order(-tra$Profit),]

Veriyi dilimleme

library(ggplot2)
# Split the profit-ranked rows into 10 equal-sized groups.
# NOTE(review): this assumes rownames are "1".."n" in the sorted order
# (true for a tibble); with a classic data.frame the original row labels
# would be used instead — confirm tra is a tibble.
data_sorted$group <- as.numeric(cut_number(as.numeric(rownames(data_sorted)), 10)) 
library(dplyr)
# Per-group means of Sales, Quantity and Profit feed the star plot.
data_star<-data_sorted %>%
  group_by(group) %>% 
  dplyr::summarize(Satis= mean(Sales), Miktar= mean(Quantity),Fev= mean(Profit))

# One star per decile group; key drawn at (15, 1.25).
stars(data_star[,-1], key.loc = c(15,1.25),main = "Starplot",label=row.names(data_star),cex=.7)

Kümeleme yapmak istersek 10 ve 8 birbirine çok yakın. 2 ve 9 birbirine çok yakın.

Temel İstatistikler

Nokta ölçüleri

3 Nokta Özeti

# Manual "3-point" check: compute the median of Sales by hand and compare
# with median() and mean().
n<-nrow(tra)
train_sorted <- tra[order(tra$Sales),]
# The original (n/2, n/2 + 1) averaging is only valid for even n; for odd
# n the fractional index n/2 would be silently truncated. Handle both
# cases explicitly (for the even n used here the result is unchanged).
if (n %% 2 == 0) {
  a<-(n/2)
  b<-(n/2)+1
  (train_sorted$Sales[a]+train_sorted$Sales[b])/2 
} else {
  train_sorted$Sales[(n + 1) / 2]
}
## [1] 50.18
median(tra$Sales)
## [1] 50.18
mean(tra$Sales)
## [1] 228.1496
# Mean (228.1) far above the median (50.2): strong right skew.
hist(tra$Sales)

Sales değişkeninde çarpıklık bulunmakta

5 Nokta özeti

# Five-number summary of Sales: min, lower hinge, median, upper hinge, max.
fivenum(tra$Sales) 
## [1]    1.504   13.870   50.180  195.281 5083.960

Değişim ölçüleri

# Coefficient of variation of Sales, in percent: 100 * sd / mean.
# NOTE(review): the original stored the mean in a variable literally named
# `mean`, shadowing base::mean(); renamed to avoid that foot-gun. The
# resulting Degisim_kats_sales value is unchanged.
sd_sales <- sd(tra$Sales)
mean_sales_cv <- mean(tra$Sales)
Degisim_kats_sales <- (sd_sales / mean_sales_cv) * 100

MAD (Median Absolute Deviation)

# MAD (median absolute deviation) of Sales, computed by hand:
# median of |x - median(x)|.
# NOTE(review): a median does not require pre-sorted input, so the two
# ordering steps of the original were redundant; the variables `sort` and
# `mad` also shadowed base::sort() / stats::mad(). The computed value is
# identical.
medianf<-median(tra$Sales)
mad_sales <- median(abs(tra$Sales - medianf))

Genişletilmiş Nokta Özeti

Sol kuyruk

# Left-tail letter-value summary: the quantiles of x at depths
# 1/2, 1/4, 1/8, 1/16, 1/32 and 1/64, as a named numeric vector.
sol <- function(x) {
  depths <- 1 / 2^(1:6)
  # Call quantile() once per depth so the percent-names of the result
  # match the original element-by-element construction.
  unlist(lapply(depths, function(p) quantile(x, probs = p)))
}

Sağ kuyruk

# Right-tail letter-value summary: the quantiles of x at
# 1/2, 3/4, 7/8, 15/16, 31/32 and 63/64, as a named numeric vector.
sag <- function(x) {
  depths <- c(1/2, 3/4, 7/8, 15/16, 31/32, 63/64)
  # One quantile() call per depth keeps the percent-names identical to
  # the original element-by-element construction.
  unlist(lapply(depths, function(p) quantile(x, probs = p)))
}

Kuyruk uzunluğu incelemesi

# Tail-length diagnostic for Profit: pair the lower- and upper-tail
# letter values and look at their midpoints. Midpoints that drift away
# from the median with depth indicate skew / unequal tail lengths.
x_a<-sol(tra$Profit)
x_u<-sag(tra$Profit)
x_mrg<-as.data.frame(cbind(x_a,x_u))
rownames(x_mrg)<-c("1/2","1/4","1/8","1/16","1/32","1/64")
colnames(x_mrg)<-c("Alt_Kuyruk","Ust_Kuyruk")
# Midpoint of each lower/upper quantile pair.
x_mrg$orta_nokta<-(x_mrg$Alt_Kuyruk+x_mrg$Ust_Kuyruk)/2
x_mrg
##      Alt_Kuyruk Ust_Kuyruk orta_nokta
## 1/2     6.81540    6.81540    6.81540
## 1/4     1.02850   26.89180   13.96015
## 1/8   -10.04861   76.57651   33.26395
## 1/16  -38.38016  115.60927   38.61456
## 1/32  -84.32235  227.98207   71.82986
## 1/64 -133.18305  544.98190  205.89943
# Midpoints grow steadily above the median: right-skewed Profit.
hist(tra$Profit)

Profit değişkeninin de çarpık olduğunu görebiliyoruz.

Kesilmiş ortalama

# 10% trimmed mean of Profit.
p<-0.1
mean(tra$Profit, trim = p)
## [1] 14.51672
# Number of observations remaining after trimming:
# mean(trim = p) drops floor(p * n) observations from EACH end, so the
# remaining count is n - 2 * floor(p * n). (The original n - floor(2*p*n)
# can differ by one whenever p * n is fractional; here p * n = 32, so the
# printed result is unchanged.)
n<-nrow(tra)
ks <- n - 2L * as.integer(p * n)
ks
## [1] 256

Ortalama değerimiz olan 14.51'e karşılık gelen 256 gözlemimiz var.

Geometrik ortalama

# Geometric mean of Sales (well-defined here: the five-number summary
# above shows Sales > 0 throughout).
library("psych")
geometric.mean(tra$Sales)
## [1] 56.07413

Gini

# Frequency table of Category — input to the Gini computation below.
freq <- as.data.frame(table(tra$Category))
names(freq)[1] <- 'Kategori'

# Gini impurity of a class-frequency split: 1 - sum(p_i^2).
#
# Backward-compatible generalization: `a` may now also be a numeric
# vector of ALL class frequencies (with `b` omitted), so factors with
# more than two levels no longer have to be truncated to their first two
# frequencies. The original two-argument call gini(a, b) behaves exactly
# as before.
#
# @param a First class frequency, or a vector of all class frequencies.
# @param b Second class frequency; ignored when `a` is the full vector.
# @return The Gini impurity, 1 - sum(p^2).
gini <- function(a, b = NULL) {
  p <- if (is.null(b)) a / sum(a) else c(a, b) / (a + b)
  1 - sum(p^2)
}
# Normalized Gini: divide by the two-class maximum (k-1)/k.
# NOTE(review): only the first two Category frequencies enter the index
# even though Category has three levels — confirm this truncation is
# intended.
gn<-gini(freq[1,2],freq[2,2])
k<-2
gn/((k-1)/k)
## [1] 0.7558937
# Same computation for the discount indicator. Indirim_kat has exactly
# two levels, so the two-frequency formula is exact here.
freq <- as.data.frame(table(tra$`Indirim_kat`))
names(freq)[1] <- 'Kategori'

# NOTE(review): this re-defines the identical gini() from above; the
# duplicated definitions could be collapsed into one.
gini <- function(a,b) {
  a1 <- (a/(a+b))**2
  b1 <- (b/(a+b))**2
  x<-1-(a1 + b1)
  return(x)
}
gn<-gini(freq[1,2],freq[2,2])
k<-2
gn/((k-1)/k)
## [1] 0.9933984
# Same computation for Region.
# NOTE(review): Region has four levels but only the first two
# frequencies (West and Central) enter the index — confirm intended.
freq <- as.data.frame(table(tra$Region))
names(freq)[1] <- 'Kategori'

# NOTE(review): third identical definition of gini(); could be removed.
gini <- function(a,b) {
  a1 <- (a/(a+b))**2
  b1 <- (b/(a+b))**2
  x<-1-(a1 + b1)
  return(x)
}
gn<-gini(freq[1,2],freq[2,2])
k<-2
gn/((k-1)/k)
## [1] 0.995862

Entropi

# Shannon entropy (in the given log base) of a two-class split with
# frequencies `a` and `b`: -(p_a * log(p_a) + p_b * log(p_b)).
# (The original expressed the first term through abs(); for proportions
# in (0, 1) the two forms are numerically identical.)
entropy <- function(base, a, b) {
  p_a <- a / (a + b)
  p_b <- b / (a + b)
  -(p_a * log(p_a, base) + p_b * log(p_b, base))
}
# Normalized entropy: divide by log(k) so the maximum is 1.
# NOTE(review): `freq` here is whichever table was built last above
# (Region) — confirm that is the intended variable.
ent<-entropy(10,freq[1,2],freq[2,2])
k<-2
ent/(log(k,10)) 
## [1] 0.997013

Entropi değeri oldukça yüksek çıkmıştır. Değişkenlik çoktur.

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x psych::%+%()       masks ggplot2::%+%()
## x psych::alpha()     masks ggplot2::alpha()
## x dplyr::filter()    masks mice::filter(), stats::filter()
## x dplyr::lag()       masks stats::lag()
## x Hmisc::src()       masks dplyr::src()
## x Hmisc::summarize() masks dplyr::summarize()
# Sales vs Profit scatter with a least-squares line to eyeball linearity.
ggplot(tra, aes(Sales,Profit))+
  geom_point(size=2,shape=21,stroke=1,color="dodgerblue1", fill="white")+
  geom_smooth(method = "lm", col="darkblue",se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

“Sales” değişkeni ile “Profit” değişkeni arasında doğrusal olmama problemi olduğunu söyleyebiliriz. Satışın artmasıyla karın arttığını gözlemleyebiliyoruz. İki değişkenin ilişkisi ile çıkarılan grafikte olası iki adet aykırı değer olabilecek değer olduğunu görüyoruz. Şu an için çıkarmamayı tercih ediyoruz. İlerleyen süreçlerde model içerisinde problem çıkarırlarsa o zaman bu değişkenler hakkında tekrar düşünülebilir.

# Same scatter with row labels so candidate outliers can be identified
# by observation number.
ggplot(tra,aes(x=Sales,y=Profit))+
  geom_point(size=1)+
  geom_text(label=rownames(tra),nudge_x=0.25,nudge_y=0.25, check_overlap=T)+
  geom_smooth(method=lm,col="red",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

library(ggExtra)
## Warning: package 'ggExtra' was built under R version 4.1.3
# Labeled scatter again, stored so ggMarginal can attach marginal
# histograms of Sales and Profit to the axes.
gr<-ggplot(tra,aes(x=Sales,y=Profit))+
  geom_point()+
  geom_text(size=3,label=rownames(tra),nudge_x=0.25,
            nudge_y=0.25, check_overlap=T)+
  geom_smooth(method=lm,col="brown1", se=FALSE)

ggMarginal(gr,type="histogram",fill="darksalmon")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Daha detaylı bilgi sahibi olmak için bar plot grafiği de ekledik. Verilerin normal dağılmadığını gözlemliyoruz.

Yukarıda aykırı değer olabileceği hakkında söz edilen değerlerin 181 ve 187 değerleri olduğunu görüyoruz.

# Discount vs Profit, labeled, with a least-squares line.
ggplot(tra,aes(x=Discount,y=Profit))+
  geom_point(size=1)+
  geom_text(label=rownames(tra),nudge_x=0.25,nudge_y=0.25, check_overlap=T)+
  geom_smooth(method=lm,col="red",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

Discount değişkeni için aykırı değer olabilecek 181 ve 187 değerleri vardır ve dönüşüme ihtiyaç vardır.

# Quantity vs Profit, labeled, with a least-squares line.
ggplot(tra,aes(x=Quantity,y=Profit))+
  geom_point(size=1)+
  geom_text(label=rownames(tra),nudge_x=0.25,nudge_y=0.25, check_overlap=T)+
  geom_smooth(method=lm,col="red",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

Quantity değişkeni için aykırı değer olabilecek yine 181 ve 187 değerleri vardır ve dönüşüme ihtiyaç vardır.

library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
## 
##     subplot
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Discount vs Profit by Region; horizontal jitter spreads the points that
# share the same discrete Discount value.
d_plot <- ggplot(tra, aes(Discount, Profit, fill=Region, shape=Region)) +
  geom_point(position = position_jitter(width= 0.2, height = 0), size = 2)

# Render interactively through plotly for hover inspection.
ggplotly(d_plot)

İndirim sağlamanın genel olarak kâra çok etki etmediğini görüyoruz. Yalnızca Batı bölgesinde %20'lik bir indirim sonrası çok büyük bir kâra ulaşılmış; Merkez'de yapılan %80'lik bir indirim sonrası ise çok büyük bir zarara uğranmıştır. Bunlar aykırı değer olabilirler; fakat etkili değer olabileceklerinden verimizde tutmaya devam ediyoruz.

Kabarcık çizimi

# Bubble plot: Sales vs Profit, with Discount encoded twice (point size
# and color gradient).
library(ggplot2)
ggplot(tra, aes(Sales,Profit, color=Discount, size=Discount))+
  geom_point(alpha=0.5, stroke=2)+
  scale_size(range = c(1, 8))+
  scale_color_gradient(low = "blue", high = "lightpink")

Altıgen Çizim

# Hexagonal binning of Discount vs Profit; fill shows point density.
ggplot(tra,aes(x=Discount,y=Profit))+
  geom_hex(bins=20, color = "white")+
  scale_fill_gradient(low="mistyrose2", high="violetred3")

Aralarında doğrusal olmama problemi olduğunu söyleyebiliriz. Gözlemler en çok indirimin ve karın 0 olduğu yerde bulunmaktadır.

Kontür Çizimi

# 2-D density contours of Sales vs Profit.
# NOTE(review): `..level..` is the pre-3.4 ggplot2 spelling of
# after_stat(level) — consistent with the ggplot2 version used here.
ggplot(tra, aes(x=Sales, y=Profit) ) +
  stat_density_2d(aes(fill = ..level..), geom = "polygon")+
  scale_fill_distiller(palette = "Blues")

“Profit” değişkeni ve “Sales” değişkeni arasında değişen varyanslılık problemi olduğunu söyleyebiliriz.

Koşullu matris çizimi (?)

Korelasyon Haritası

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
# Correlation map of the quantitative columns.
# NOTE(review): columns 9:12 are assumed to be the four numeric
# variables (Sales, Quantity, Discount, Profit) — hard-coded positions,
# verify against the column order of tra.
corrplot(cor(tra[,9:12]), method = "ellipse")

# Mixed view: coefficients below the diagonal, squares above.
corrplot.mixed(cor(tra[,9:12]), lower = "number", 
               upper = "square",tl.col = "black") 

Korelasyon haritasına bakarak nicel değişkenlerin birbirleriyle ilişkisinin çok yüksek olmadığını söyleyebiliriz.

Ortanca ve DAG değerleri bulmak.

Bölgelere göre Kar üzerinden ortanca ve DAG değerlerini bulalım.

library(dplyr)
# Median and interquartile range (DAG = Q3 - Q1) of Profit per Region.
a<-tra %>% group_by(Region) %>%
  dplyr:: summarize(Q1=quantile (Profit, probs=0.25), Median=quantile (Profit, probs=0.50), Q3=quantile(Profit, probs=0.75), DAG=Q3-Q1)
a
## # A tibble: 4 x 5
##   Region      Q1 Median    Q3   DAG
##   <fct>    <dbl>  <dbl> <dbl> <dbl>
## 1 West      3.53   8.88  26.8  23.3
## 2 Central -19.2    2.93  13.2  32.5
## 3 East      1.66   9.33  36.9  35.2
## 4 South     2.03   7.31  32.6  30.6

Kategorilere göre Kar üzerinden ortanca ve DAG değerlerini bulalım.

library(dplyr)
# Median and IQR (DAG) of Profit per Category.
b<-tra %>%group_by(Category) %>%
  dplyr:: summarize(Q1=quantile (Profit, probs=0.25), Median=quantile (Profit, probs=0.50), Q3=quantile(Profit, probs=0.75), DAG=Q3-Q1)
b
## # A tibble: 3 x 5
##   Category            Q1 Median    Q3   DAG
##   <fct>            <dbl>  <dbl> <dbl> <dbl>
## 1 Technology        5.51  29.4   70.9  65.4
## 2 Office Supplies   1.80   5.66  14.8  13.0
## 3 Furniture       -25.2    6.22  28.0  53.2

Taşıma şekline göre Kar üzerinden ortanca ve DAG değerlerini bulalım.

library(dplyr)
# Median and IQR (DAG) of Profit per Ship Mode; .drop = FALSE keeps
# factor levels with zero observations in the output.
c<-tra %>%group_by(`Ship Mode` , .drop = FALSE) %>%
  dplyr::  summarize(Q1=quantile (Profit , probs=0.25), Median=quantile (Profit, probs=0.50), Q3=quantile(Profit, probs=0.75), DAG=Q3-Q1)
c
## # A tibble: 4 x 5
##   `Ship Mode`         Q1 Median    Q3   DAG
##   <fct>            <dbl>  <dbl> <dbl> <dbl>
## 1 Second Class     2.80   19.0   47.5  44.7
## 2 Standard Class   0.540   6.22  25.3  24.8
## 3 First Class      1.19    7.14  14.9  13.7
## 4 Same Day       -21.9     7.19  20.7  42.6

Alt kategorilere göre Kar üzerinden ortanca ve DAG değerlerini bulalım.

library(dplyr)
# Median and IQR (DAG) of Profit per Sub-Category.
d<-tra %>%group_by( `Sub-Category`) %>%
   dplyr:: summarize(Q1=quantile (Profit, probs=0.25), Median=quantile (Profit, probs=0.50), Q3=quantile(Profit, probs=0.75), DAG=Q3-Q1)
d
## # A tibble: 16 x 5
##    `Sub-Category`     Q1 Median     Q3    DAG
##    <fct>           <dbl>  <dbl>  <dbl>  <dbl>
##  1 Accessories     16.2  57.6    87.9   71.7 
##  2 Appliances     -20.3  10.9    26.3   46.7 
##  3 Art              1.36  2.63    7.38   6.02
##  4 Binders         -4.92  2.34    9.88  14.8 
##  5 Bookcases       -2.10  8.50   60.4   62.5 
##  6 Chairs         -25.2  10.3    63.6   88.7 
##  7 Envelopes        5.34  7.60    8.40   3.06
##  8 Fasteners        1.96  3.60    5.40   3.45
##  9 Furnishings     -4.58  5.53   14.9   19.5 
## 10 Labels           3.88  6.12   16.9   13.0 
## 11 Machines       -16.4   9.00  216.   232.  
## 12 Paper            5.26  8.86   25.3   20.0 
## 13 Phones           2.02 21.1    35.8   33.8 
## 14 Storage          2.73  7.91   22.7   20.0 
## 15 Supplies         3.60 11.6    15.4   11.8 
## 16 Tables         -95.7  -0.889  24.6  120.

Ortanca izi çizimi

# Median-trace plots: group medians of Profit against each factor.
plot(a$Region,a$Median, xlab="Region", ylab="Ortanca", main="Ortanca izi cizimi")

plot(b$Category,b$Median, xlab="Category", ylab="Ortanca", main="Ortanca izi cizimi")

plot(c$`Ship Mode`,c$Median, xlab="Ship Mode", ylab="Ortanca", main="Ortanca izi cizimi")

plot(d$`Sub-Category`,d$Median, xlab="Sub-Category", ylab="Ortanca", main="Ortanca izi cizimi")

Konum-Varyans Cizimi

# Spread-vs-location plot for Region: DAG (IQR) against the group median.
ggplot(a, aes(x=Median,y=DAG, color=Region, group=1))+
  geom_point(size=4,alpha=0.6)+
  geom_line(color="black")

Varyanslar homojen değil dönüşüme ihtiyaç var.

# Spread-vs-location plot for Category.
ggplot(b, aes(x=Median,y=DAG, color=Category, group=1))+
  geom_point(size=4,alpha=0.6)+
  geom_line(color="black")

Varyanslar homojen değil dönüşüme ihtiyaç var.

# Spread-vs-location plot for Ship Mode.
ggplot(c, aes(x=Median,y=DAG, color=`Ship Mode`, group=1))+
  geom_point(size=4,alpha=0.6)+
  geom_line(color="black")

Varyanslar homojen değil dönüşüme ihtiyaç var.

# Spread-vs-location plot for Sub-Category.
ggplot(d, aes(x=Median,y=DAG, color=`Sub-Category`, group=1))+
  geom_point(size=4,alpha=0.6)+
  geom_line(color="black")

Varyanslar homojen değil dönüşüme ihtiyaç var.

Etkileşim

# Interaction check: median Profit per Region x Category cell; crossing
# lines in the plot below would indicate an interaction.
etk_train<-tra%>%
  group_by(Region,Category)%>% 
  summarise(Median=median(Profit))
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
etk_train
## # A tibble: 12 x 3
## # Groups:   Region [4]
##    Region  Category        Median
##    <fct>   <fct>            <dbl>
##  1 West    Technology       30.8 
##  2 West    Office Supplies   6.47
##  3 West    Furniture        10.1 
##  4 Central Technology       15.5 
##  5 Central Office Supplies   3.63
##  6 Central Furniture       -26.1 
##  7 East    Technology       36.9 
##  8 East    Office Supplies   6.81
##  9 East    Furniture        15.5 
## 10 South   Technology       29.9 
## 11 South   Office Supplies   6.13
## 12 South   Furniture         8.50
# One line per Region across the Category levels.
ggplot(etk_train, aes(x = Category, y = Median,color=Region,group=Region)) +
  geom_line() +
  geom_point()

Dönüşüm

# Candidate transforms for right-skewed Sales: log10 and square root.
tra$sales_log<-log10(tra$Sales)
hist(tra$sales_log)

tra$sales_kok <- sqrt(tra$Sales)
hist(tra$sales_kok)

Sales değişkeni için log dönüşümünün yeterli olduğunu söyleyebiliriz. Log dönüşümünde normale yakınsadı.

# Transforms for Discount; the "+ 1 - min(...)" shift guarantees a
# strictly positive argument before taking logs/roots.
tra$discount_log<-log10(tra$Discount + 1 - min(tra$Discount))
hist(tra$discount_log)

tra$discount_kok <- sqrt(tra$Discount + 1 - min(tra$Discount))
hist(tra$discount_kok)

Discount değişkeni için log dönüşümünü kullanmayı tercih ediyoruz.

# Transforms for Profit; shifted log and root first.
tra$profit_log<-log10(tra$Profit + 1 - min(tra$Profit))
hist(tra$profit_log)

tra$profit_kok <- sqrt(tra$Profit + 1 - min(tra$Profit))
hist(tra$profit_kok)

# Reciprocal (inverse) transform of raw Profit.
# NOTE(review): despite the name "kare" (square), this is Profit^(-1),
# i.e. the inverse transform described in the text below. It is
# undefined/infinite where Profit == 0 — the later plots warn about one
# removed non-finite value for exactly this reason.
tra$profit_kare <-(tra$Profit)^(-1)
hist(tra$profit_kare)

Profit değişkeni için de ters dönüşüm tercih ediyoruz.

# Transforms for Quantity: log10 and square root.
tra$quantity_log<-log10(tra$Quantity)
hist(tra$quantity_log)

tra$quantity_kok <- sqrt(tra$Quantity)
hist(tra$quantity_kok)

Quantity değişkeni için log dönüşümünü uygulamayı tercih ediyoruz.

Düzleştirme

# LOESS smoothing of the transformed pairs. The "Removed 1 rows" warning
# is the single non-finite profit_kare value (reciprocal of a zero-ish
# Profit).
ggplot(tra, aes(sales_log,profit_kare,label=rownames(tra)))+
  geom_point(size=1)+
  geom_text(label=rownames(tra),nudge_x=0.04,check_overlap=T,size=2.5)+
  geom_smooth(method = "loess", col="darkblue",se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

# discount_log vs profit_kare.
ggplot(tra, aes(discount_log,profit_kare,label=rownames(tra)))+
  geom_point(size=1)+
  geom_text(label=rownames(tra),nudge_x=0.04,check_overlap=T,size=2.5)+
  geom_smooth(method = "loess", col="darkblue",se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

# quantity_log vs profit_kare.
ggplot(tra, aes(quantity_log,profit_kare , label=rownames(tra)))+
  geom_point(size=1)+
  geom_text(label=rownames(tra),nudge_x=0.04,check_overlap=T,size=2.5)+
  geom_smooth(method = "loess", col="darkblue",se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

# Robust outlier bounds for quantity_log: median +/- 3 * MAD.
lower_bound_quantity_log <- median(tra$quantity_log) - 3 * mad(tra$quantity_log, constant = 1)
lower_bound_quantity_log
## [1] -0.05115252
upper_bound_quantity_log <- median(tra$quantity_log) + 3 * mad(tra$quantity_log, constant = 1)
upper_bound_quantity_log
## [1] 1.005395
# BUG FIX: the original compared quantity_log against the *Discount*
# bounds (lower_bound_discount / upper_bound_discount), not the
# quantity_log bounds computed just above, so the index list printed
# below belongs to the wrong variable.
outlier_ind_quantity_log <- which(tra$quantity_log < lower_bound_quantity_log | tra$quantity_log > upper_bound_quantity_log)
outlier_ind_quantity_log
##  [1]   8  16  21  22  25  27  31  34  55  61  74  76  89  91 133 160 164 176 177
## [20] 184 187 198 200 205 215 227 230 243 252 253 259 266 267 268 269 289 295 298
## [39] 299 311 313
# Same median +/- 3 * MAD outlier screen for sales_log.
lower_bound_sales_log <- median(tra$sales_log) - 3 * mad(tra$sales_log, constant = 1)
lower_bound_sales_log
## [1] -0.007087954
upper_bound_sales_log <- median(tra$sales_log) + 3 * mad(tra$sales_log, constant = 1)
upper_bound_sales_log
## [1] 3.407871
outlier_ind_sales_log <- which(tra$sales_log < lower_bound_sales_log | tra$sales_log > upper_bound_sales_log)
outlier_ind_sales_log
## [1]  63 129 164 181
# ... for discount_log (no flagged observations).
lower_bound_discount_log <- median(tra$discount_log) - 3 * mad(tra$discount_log, constant = 1)
lower_bound_discount_log
## [1] -0.1583625
upper_bound_discount_log <- median(tra$discount_log) + 3 * mad(tra$discount_log, constant = 1)
upper_bound_discount_log
## [1] 0.316725
outlier_ind_discount_log <- which(tra$discount_log < lower_bound_discount_log | tra$discount_log > upper_bound_discount_log)
outlier_ind_discount_log
## integer(0)
# ... and for profit_kare (the reciprocal Profit transform).
lower_bound_profit_kare <- median(tra$profit_kare) - 3 * mad(tra$profit_kare, constant = 1)
lower_bound_profit_kare
## [1] -0.1548746
upper_bound_profit_kare <- median(tra$profit_kare) + 3 * mad(tra$profit_kare, constant = 1)
upper_bound_profit_kare
## [1] 0.2412646
outlier_ind_profit_kare <- which(tra$profit_kare < lower_bound_profit_kare | tra$profit_kare > upper_bound_profit_kare)
outlier_ind_profit_kare
##  [1]   6   7   9  11  15  17  18  26  33  45  50  52  54  66  67  74  77  80  83
## [20]  85  87  96  98 101 103 107 111 112 121 128 131 134 138 142 143 150 151 152
## [39] 153 154 157 161 163 175 178 185 191 193 202 204 207 209 211 214 216 221 223
## [58] 224 238 245 251 256 260 272 274 279 281 283 285 286 296 304 306 309

Dönüşümlerin aykırı/uç değerleri önemli ölçüde azalttığını görüyoruz. Etkili değer olabilecekleri için bu gözlemleri çıkarmamayı tercih ediyoruz.

Sales değişkenimizi merkezileştiriyoruz.

# Center the Sales variable and compare linear / quadratic / cubic fits
# against profit_kare.
mean_sales<-mean(tra$Sales)
# BUG FIX: the original subtracted mean_sales from tra$Profit, even
# though the new column is named sales_merkez and the surrounding text
# says Sales is being centered. Center Sales itself.
tra$sales_merkez <- tra$Sales - mean_sales
ggplot(tra, aes(x = sales_merkez, y =profit_kare )) +
  stat_smooth(method = "lm", se = FALSE, color = "green", formula = y ~ x) +
  stat_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x + I(x ^ 2)) +
  stat_smooth(method = "lm", se = FALSE, color = "red", formula = y ~ x + I(x ^ 2)+ I(x ^ 3)) +
  geom_point(colour = "black", size = 1)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Removed 1 rows containing non-finite values (stat_smooth).
## Removed 1 rows containing non-finite values (stat_smooth).

Kırmızı çizgimiz noktalarla en iyi uyuşan çizgidir. Kübik dönüşüm gerekmektedir.

loglu sales’i merkezleştirip karesel terimlere bakıyoruz

# Center log-Sales, then compare linear / quadratic / cubic fits against
# profit_kare.
mean_saleslog<-mean(tra$sales_log)
tra$sales_log_merkez<-(tra$sales_log-mean_saleslog)
ggplot(tra, aes(x = sales_log_merkez, y =profit_kare )) +
  stat_smooth(method = "lm", se = FALSE, color = "green", formula = y ~ x) +
  stat_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x + I(x ^ 2)) +
  stat_smooth(method = "lm", se = FALSE, color = "red", formula = y ~ x + I(x ^ 2)+ I(x ^ 3)) +
  geom_point(colour = "black", size = 1)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Removed 1 rows containing non-finite values (stat_smooth).
## Removed 1 rows containing non-finite values (stat_smooth).

# Same three fits re-drawn with a different color scheme.
ggplot(tra, aes(x = sales_log_merkez, y =profit_kare )) +
  stat_smooth(method = "lm", se = FALSE, color = "magenta", formula = y ~ x) +
  stat_smooth(method = "lm", se = FALSE, color = "green", formula = y ~ x + I(x ^ 2)) +
  stat_smooth(method = "lm", se = FALSE, color = "cyan", formula = y ~ x + I(x ^ 2)+ I(x ^ 3)) +
  geom_point(colour = "black", size = 1)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Removed 1 rows containing non-finite values (stat_smooth).
## Removed 1 rows containing non-finite values (stat_smooth).

Tukey’s Ladder

library(rcompanion)
## Warning: package 'rcompanion' was built under R version 4.1.3
## Registered S3 method overwritten by 'DescTools':
##   method       from   
##   plot.bagplot aplpack
## 
## Attaching package: 'rcompanion'
## The following object is masked from 'package:psych':
## 
##     phi
# Tukey ladder-of-powers transform of (shifted, strictly positive)
# Profit; transformTukey() reports the lambda maximizing Shapiro-Wilk W.
profit_tukey<-transformTukey(tra$Profit + 1 - min(tra$Profit),plotit=FALSE)
## 
##     lambda      W Shapiro.p.value
## 482  2.025 0.3295       1.575e-32
## 
## if (lambda >  0){TRANS = x ^ lambda} 
## if (lambda == 0){TRANS = log(x)} 
## if (lambda <  0){TRANS = -1 * x ^ lambda}
# BUG FIX: the original assigned the Sales transformation back into
# profit_tukey, silently overwriting the Profit result above; store it
# under its own name.
sales_tukey <- transformTukey(tra$Sales, plotit=FALSE)
## 
##     lambda      W Shapiro.p.value
## 398 -0.075 0.9881         0.01017
## 
## if (lambda >  0){TRANS = x ^ lambda} 
## if (lambda == 0){TRANS = log(x)} 
## if (lambda <  0){TRANS = -1 * x ^ lambda}

Box - Cox

Profit için

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
# Box-Cox profile likelihood for (shifted) Profit over lambda in [-6, 6].
Box_profit<- boxcox(tra$Profit + 1 - min(tra$Profit) ~ 1,            
                 lambda = seq(-6,6,0.1))      

# Pick the lambda with the highest log-likelihood (sort descending, take row 1).
Cox_profit<- data.frame(Box_profit$x, Box_profit$y) 
Cox_profit <- Cox_profit[order(-Cox_profit$Box_profit.y),]  
Cox_profit[1,] 
##    Box_profit.x Box_profit.y
## 81            2       2.6096
lambda <- Cox_profit[1, "Box_profit.x"]
lambda
## [1] 2

Sales için

library(MASS)

# Same Box-Cox search for Sales (already strictly positive, no shift needed).
Box_sales<- boxcox(tra$Sales ~ 1,            
                 lambda = seq(-6,6,0.1))      

Cox_sales<- data.frame(Box_sales$x, Box_sales$y) 
Cox_sales <- Cox_sales[order(-Cox_sales$Box_sales.y),]  
Cox_sales[1,] 
##    Box_sales.x Box_sales.y
## 60        -0.1   -1088.712
# NOTE(review): this overwrites the Profit lambda computed just above;
# distinct names (lambda_profit / lambda_sales) would be clearer.
lambda <- Cox_sales[1, "Box_sales.x"]
lambda
## [1] -0.1

Ham hali üzerinden saçılım matrisi

Çarpıklık gözlemi

# Scatterplot matrix of the raw numeric columns to inspect skewness and
# pairwise correlations before transformation.
# NOTE(review): columns are picked by position (9:12) — fragile if the sheet
# layout changes; selecting by name would be safer.
orj<-tra[,c(9,10,11,12)]
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 4.1.3
## Zorunlu paket yükleniyor: xts
## Warning: package 'xts' was built under R version 4.1.3
## Zorunlu paket yükleniyor: zoo
## Warning: package 'zoo' was built under R version 4.1.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
chart.Correlation(orj, histogram=TRUE, pch=19)

# Same matrix on the transformed columns (again positional indices).
transform_train<-tra[,c(15,21,17,19)] 
chart.Correlation(transform_train, histogram=TRUE, pch=19)

# Birliktelik İstatistikleri

# Category x Region contingency table; prop.table(dt1, 2) gives the
# distribution of categories WITHIN each region (columns sum to 1).
dt1<-table(tra$Category,tra$Region)
prop.table(dt1,2) 
##                  
##                        West   Central      East     South
##   Technology      0.1978022 0.1500000 0.2197802 0.2413793
##   Office Supplies 0.5824176 0.6125000 0.5494505 0.6379310
##   Furniture       0.2197802 0.2375000 0.2307692 0.1206897
# Same proportions as percentages, rounded to 2 decimals.
round(100*prop.table(dt1,2), 2) 
##                  
##                    West Central  East South
##   Technology      19.78   15.00 21.98 24.14
##   Office Supplies 58.24   61.25 54.95 63.79
##   Furniture       21.98   23.75 23.08 12.07
# Append column sums (margin 1) to confirm each region's shares total 1.
addmargins(round(prop.table(dt1,2), 2),1)
##                  
##                   West Central East South
##   Technology      0.20    0.15 0.22  0.24
##   Office Supplies 0.58    0.61 0.55  0.64
##   Furniture       0.22    0.24 0.23  0.12
##   Sum             1.00    1.00 1.00  1.00

Satır bazında oranlar: prop.table(dt1, 1)

# Re-tabulate Category x Region and look at row-wise proportions:
# the distribution of regions WITHIN each category (rows sum to 1).
dt1<-table(tra$Category,tra$Region)
# BUG FIX: the original line was prop.table(data.matrix(rowsum(2,1)), 1),
# which tabulates nothing (rowsum of the scalar 2) and printed a meaningless
# 1x1 matrix of 1. Row-wise proportions of dt1 were evidently intended.
prop.table(dt1, 1)
round(100*prop.table(dt1,2), 2) 
##                  
##                    West Central  East South
##   Technology      19.78   15.00 21.98 24.14
##   Office Supplies 58.24   61.25 54.95 63.79
##   Furniture       21.98   23.75 23.08 12.07
library("gplots")
## Warning: package 'gplots' was built under R version 4.1.3
## Registered S3 method overwritten by 'gplots':
##   method         from     
##   reorder.factor DescTools
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:PerformanceAnalytics':
## 
##     textplot
## The following object is masked from 'package:stats':
## 
##     lowess
# Balloon plot: cell frequency of every Category x Region combination.
balloonplot(t(dt1), main ="Category ve Region  ", xlab ="", ylab="",
            label = FALSE,show.margins = FALSE)

Office Supplies kategorisi her bölgede çok daha fazladır.

# Sub-Category x Region table: share of each sub-category within every region
# (column-wise proportions; columns sum to 1).
dt2<-table(tra$`Sub-Category`,tra$Region)
prop.table(dt2,2) 
##              
##                     West    Central       East      South
##   Accessories 0.08791209 0.07500000 0.08791209 0.12068966
##   Appliances  0.02197802 0.06250000 0.02197802 0.06896552
##   Art         0.13186813 0.06250000 0.07692308 0.06896552
##   Binders     0.09890110 0.15000000 0.20879121 0.15517241
##   Bookcases   0.01098901 0.00000000 0.01098901 0.01724138
##   Chairs      0.12087912 0.10000000 0.06593407 0.03448276
##   Copiers     0.00000000 0.00000000 0.00000000 0.00000000
##   Envelopes   0.00000000 0.00000000 0.04395604 0.01724138
##   Fasteners   0.04395604 0.01250000 0.00000000 0.03448276
##   Furnishings 0.03296703 0.10000000 0.13186813 0.05172414
##   Labels      0.07692308 0.03750000 0.02197802 0.06896552
##   Machines    0.01098901 0.01250000 0.01098901 0.03448276
##   Paper       0.12087912 0.17500000 0.08791209 0.12068966
##   Phones      0.09890110 0.06250000 0.12087912 0.08620690
##   Storage     0.06593407 0.10000000 0.07692308 0.08620690
##   Supplies    0.02197802 0.01250000 0.01098901 0.01724138
##   Tables      0.05494505 0.03750000 0.02197802 0.01724138
# Percentages rounded to 2 decimals for readability.
round(100*prop.table(dt2,2), 2) 
##              
##                West Central  East South
##   Accessories  8.79    7.50  8.79 12.07
##   Appliances   2.20    6.25  2.20  6.90
##   Art         13.19    6.25  7.69  6.90
##   Binders      9.89   15.00 20.88 15.52
##   Bookcases    1.10    0.00  1.10  1.72
##   Chairs      12.09   10.00  6.59  3.45
##   Copiers      0.00    0.00  0.00  0.00
##   Envelopes    0.00    0.00  4.40  1.72
##   Fasteners    4.40    1.25  0.00  3.45
##   Furnishings  3.30   10.00 13.19  5.17
##   Labels       7.69    3.75  2.20  6.90
##   Machines     1.10    1.25  1.10  3.45
##   Paper       12.09   17.50  8.79 12.07
##   Phones       9.89    6.25 12.09  8.62
##   Storage      6.59   10.00  7.69  8.62
##   Supplies     2.20    1.25  1.10  1.72
##   Tables       5.49    3.75  2.20  1.72
# Column sums appended as a margin; sums of 0.98-1.01 (not exactly 1) are a
# consequence of rounding each cell to 2 decimals before summing.
addmargins(round(prop.table(dt2,2), 2),1)
##              
##               West Central East South
##   Accessories 0.09    0.07 0.09  0.12
##   Appliances  0.02    0.06 0.02  0.07
##   Art         0.13    0.06 0.08  0.07
##   Binders     0.10    0.15 0.21  0.16
##   Bookcases   0.01    0.00 0.01  0.02
##   Chairs      0.12    0.10 0.07  0.03
##   Copiers     0.00    0.00 0.00  0.00
##   Envelopes   0.00    0.00 0.04  0.02
##   Fasteners   0.04    0.01 0.00  0.03
##   Furnishings 0.03    0.10 0.13  0.05
##   Labels      0.08    0.04 0.02  0.07
##   Machines    0.01    0.01 0.01  0.03
##   Paper       0.12    0.17 0.09  0.12
##   Phones      0.10    0.06 0.12  0.09
##   Storage     0.07    0.10 0.08  0.09
##   Supplies    0.02    0.01 0.01  0.02
##   Tables      0.05    0.04 0.02  0.02
##   Sum         0.99    0.98 1.00  1.01
# Segment x Region table: distribution of customer segments within each region.
dt3<-table(tra$Segment,tra$Region)
prop.table(dt3,2) 
##              
##                    West   Central      East     South
##   Consumer    0.4505495 0.5000000 0.5824176 0.4655172
##   Home Office 0.1868132 0.1875000 0.1428571 0.2413793
##   Corporate   0.3626374 0.3125000 0.2747253 0.2931034
round(100*prop.table(dt3,2), 2) 
##              
##                West Central  East South
##   Consumer    45.05   50.00 58.24 46.55
##   Home Office 18.68   18.75 14.29 24.14
##   Corporate   36.26   31.25 27.47 29.31
# Column sums as a margin (0.99 for East is a rounding artifact).
addmargins(round(prop.table(dt3,2), 2),1)
##              
##               West Central East South
##   Consumer    0.45    0.50 0.58  0.47
##   Home Office 0.19    0.19 0.14  0.24
##   Corporate   0.36    0.31 0.27  0.29
##   Sum         1.00    1.00 0.99  1.00
library("gplots")
# Balloon plot of Segment x Region cell frequencies.
balloonplot(t(dt3), main ="Segment ve Region  ", xlab ="", ylab="",
            label = FALSE,show.margins = FALSE)

Consumer segmenti her bölgede daha baskın gelmektedir.

# Build the contingency table and chi-square expected counts for the
# per-row contribution analysis below.
# BUG FIX: the original crossed Region with itself — table(tra$Region,
# tra$Region) — a degenerate diagonal table whose chi-square contributions
# sum to n*(k-1) (the 229/240/229/262 ~= 960 printed below). Cross Region
# with Category instead, matching the association analysis in this section.
dt_c <- table(tra$Region, tra$Category)
dtc_exp <- chisq.test(dt_c)$expected
# Chi-square contribution of row i: sum over the columns of (O - E)^2 / E,
# where O and E are the observed and expected count matrices.
rowcs <- function(i, obs, exp) {
  observed <- obs[i, ]
  expected <- exp[i, ]
  sum((observed - expected)^2 / expected)
}

# Apply rowcs to every row of dt_c to get each row's chi-square contribution.
# NOTE(review): as.matrix() on an lapply() result yields a one-column matrix
# whose cells are list elements; vapply(..., numeric(1)) would give a plain
# numeric vector with the same printed values.
chi_dtc<-as.matrix(lapply(seq_len(nrow(dt_c)), rowcs, obs = dt_c, exp = dtc_exp))
rownames(chi_dtc)<-rownames(dt_c)
chi_dtc
##         [,1]
## West    229 
## Central 240 
## East    229 
## South   262
library(inspectdf)
## Warning: package 'inspectdf' was built under R version 4.1.3
library(dplyr)
# Column-type overview of the training set (15 numeric, 10 factor columns).
tra %>% inspect_types()
## # A tibble: 2 x 4
##   type      cnt  pcnt col_name    
##   <chr>   <int> <dbl> <named list>
## 1 numeric    15    60 <chr [15]>  
## 2 factor     10    40 <chr [10]>
# Level frequencies for every categorical column; show Region's breakdown.
tra_cat<-tra %>% inspect_cat()
tra_cat$levels$Region
## # A tibble: 4 x 3
##   value    prop   cnt
##   <chr>   <dbl> <int>
## 1 East    0.284    91
## 2 West    0.284    91
## 3 Central 0.25     80
## 4 South   0.181    58
tra_cat %>% show_plot()

Model Geliştirme ve Geçerlilik

Test kümesi

library(readxl)
# Load the 80-row hold-out test sample and apply the same type conversions
# that were used for the training data.
test <- read_excel("C:/Users/SİMAY/Desktop/Lessons/Veri Analizi/verianalizi_proje/test.xlsx")
View(test)
# Declare the categorical variables with the training-set level order
# (duplicate Country/Region factor() calls from the original removed).
test$`Ship Mode` <- factor(test$`Ship Mode`, levels=c("Second Class","Standard Class","First Class","Same Day"))
test$Segment <- factor(test$Segment, levels=c ("Consumer","Home Office","Corporate"))
test$Country <- factor(test$Country, levels=c ("United States"))
test$Region <- factor(test$Region, levels=c("West","Central","East","South"))
test$Category <- factor(test$Category, levels=c("Technology","Office Supplies","Furniture"))
test$`Sub-Category` <- as.factor(test$`Sub-Category`)
test$City <- as.factor(test$City)
test$State <- as.factor(test$State)
# BUG FIX: the original assigned as.numeric(...) results to free-standing
# globals (Quantity, Discount, Profit, Sales) that were never used, so the
# conversions never reached `test`. Convert the columns in place instead.
test$Quantity <- as.numeric(test$Quantity)
test$Discount <- as.numeric(test$Discount)
test$Profit <- as.numeric(test$Profit)
test$Sales <- as.numeric(test$Sales)
# Summary shown once (the original called summary(test) twice back to back).
summary(test)
##           Ship Mode         Segment            Country              City   
##  Second Class  :16   Consumer   :46   United States:80   New York City: 5  
##  Standard Class:43   Home Office:12                      Seattle      : 5  
##  First Class   :17   Corporate  :22                      Houston      : 4  
##  Same Day      : 4                                       Chicago      : 3  
##                                                          Los Angeles  : 3  
##                                                          Philadelphia : 3  
##                                                          (Other)      :57  
##         State        Region              Category       Sub-Category
##  California:18   West   :29   Technology     :11   Paper      :15   
##  New York  : 8   Central:17   Office Supplies:51   Binders    :13   
##  Texas     : 8   East   :23   Furniture      :18   Appliances : 9   
##  Illinois  : 5   South  :11                        Furnishings: 9   
##  Ohio      : 5                                     Chairs     : 7   
##  Washington: 5                                     Phones     : 5   
##  (Other)   :31                                     (Other)    :22   
##      Sales              Quantity         Discount          Profit        
##  Min.   :    2.264   Min.   : 1.000   Min.   :0.0000   Min.   :-559.356  
##  1st Qu.:   18.549   1st Qu.: 2.000   1st Qu.:0.0000   1st Qu.:   3.015  
##  Median :   44.046   Median : 3.000   Median :0.2000   Median :  10.326  
##  Mean   :  408.921   Mean   : 4.088   Mean   :0.1663   Mean   : 109.089  
##  3rd Qu.:  278.036   3rd Qu.: 5.000   3rd Qu.:0.2000   3rd Qu.:  36.824  
##  Max.   :13999.960   Max.   :11.000   Max.   :0.8000   Max.   :6719.981  
## 

Sales değişkeni için logaritmik dönüşüm

# Apply the same log10 transform to the test set's Sales as used for training.
test$sales_log<-log10(test$Sales)

Discount değişkeni için logaritmik dönüşüm

# Shifted log10 for Discount (the shift keeps the argument strictly positive).
# NOTE(review): min(test$Discount) is the TEST-set minimum; for a transform
# strictly comparable with training, the TRAINING minimum should arguably be
# reused — confirm against the training chunk.
test$discount_log<-log10(test$Discount + 1 - min(test$Discount))

profit değişkeni için ters dönüşüm (1/Profit; sütun adı profit_kare olsa da uygulanan dönüşüm karesel değildir)

# NOTE(review): despite the name "profit_kare" ("profit squared"), this is the
# reciprocal 1/Profit — it must match the transform applied to the training
# set. Profit == 0 would produce Inf here; verify no such rows exist in test.
test$profit_kare<-(test$Profit)^(-1)

quantity değişkeni için log dönüşümü

# Log-transform Quantity and center log-Sales.
# NOTE(review): centering uses the TEST-set mean; reusing the training mean
# (mean_saleslog) would keep train and test on the same scale — confirm intent.
test$quantity_log<-log10(test$Quantity)
test$sales_log_merkez<-(test$sales_log-mean(test$sales_log))

Seçenek modeller

# Candidate model 1: raw Profit regressed on raw Sales plus Region and
# Category dummies (baseline levels: West, Technology).
fit1<-lm(Profit ~ Sales+Region+Category, data=tra)
summary(fit1)
## 
## Call:
## lm(formula = Profit ~ Sales + Region + Category, data = tra)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3928.2   -12.8    -0.5    33.8  1221.0 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              30.67877   39.46675   0.777    0.438    
## Sales                     0.13181    0.02722   4.843 2.02e-06 ***
## RegionCentral           -76.12399   37.46741  -2.032    0.043 *  
## RegionEast               -3.76213   36.29057  -0.104    0.918    
## RegionSouth             -23.09435   41.23675  -0.560    0.576    
## CategoryOffice Supplies -15.26947   36.12577  -0.423    0.673    
## CategoryFurniture       -32.78026   42.94763  -0.763    0.446    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 244.2 on 313 degrees of freedom
## Multiple R-squared:  0.09003,    Adjusted R-squared:  0.07258 
## F-statistic: 5.161 on 6 and 313 DF,  p-value: 4.452e-05
# The Category terms are insignificant above; refit without them.
fit1_1<-lm(Profit ~ Sales+Region, data=tra)
summary(fit1_1)
## 
## Call:
## lm(formula = Profit ~ Sales + Region, data = tra)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3927.2   -13.4    -2.8    40.0  1219.8 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    14.46371   26.58628   0.544   0.5868    
## Sales           0.13223    0.02644   5.000 9.52e-07 ***
## RegionCentral -77.14647   37.35287  -2.065   0.0397 *  
## RegionEast     -3.58490   36.17903  -0.099   0.9211    
## RegionSouth   -20.65248   41.01398  -0.504   0.6149    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 243.6 on 315 degrees of freedom
## Multiple R-squared:  0.08832,    Adjusted R-squared:  0.07675 
## F-statistic: 7.629 on 4 and 315 DF,  p-value: 7.053e-06

# Prediction on the hold-out test set.

predictions <- predict(fit1_1, test) # predict over the test set

Model performans

library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
## The following object is masked from 'package:survival':
## 
##     cluster
# Training-set performance (RMSE, R^2, MAE) of the chosen linear model.
round(defaultSummary(data.frame(obs=tra$Profit,pred=predict(fit1_1,tra))),3)
##     RMSE Rsquared      MAE 
##  241.701    0.088   57.465
# Clean the transformed columns: mark infinite entries as NA, then drop
# incomplete rows with DataCombine::DropNA.
library(DataCombine)
## Warning: package 'DataCombine' was built under R version 4.1.3
# NOTE: `tra == "Inf"` relies on numeric Inf coercing to the string "Inf"
# for the comparison; it works but is fragile. (Idiom fix: `<-` not `=`.)
tra[is.na(tra) | tra == "Inf"] <- NA
dn <- DropNA(tra)  # NOTE(review): `dn` is not used again in this chunk
## No Var specified. Dropping all NAs from the data frame.
## 1 rows dropped from the data frame because of missing values.
# Test-set performance of the same model.
round(defaultSummary(data.frame(obs=test$Profit,pred=predict(fit1_1,test))),2)
##     RMSE Rsquared      MAE 
##   552.27     0.89   108.11
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.1.3
# Standard lm diagnostic plots (residuals, QQ, scale-location, leverage).
ggplot2::autoplot(fit1_1)

## Modelleme - polinomial

# Cubic polynomial in centered log-Sales plus Region and Category effects,
# fitted to the transformed profit response.
fit2<-lm(profit_kare ~ sales_log_merkez + I(sales_log_merkez^2)+I(sales_log_merkez^3)+Region+Category , data = tra)
summary(fit2)
## 
## Call:
## lm(formula = profit_kare ~ sales_log_merkez + I(sales_log_merkez^2) + 
##     I(sales_log_merkez^3) + Region + Category, data = tra)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6644 -0.0714  0.0020  0.0766  4.5973 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              0.037640   0.071886   0.524   0.6009    
## sales_log_merkez         0.051179   0.070785   0.723   0.4702    
## I(sales_log_merkez^2)    0.244900   0.043044   5.689 2.95e-08 ***
## I(sales_log_merkez^3)   -0.190157   0.044085  -4.313 2.16e-05 ***
## RegionCentral           -0.158039   0.066153  -2.389   0.0175 *  
## RegionEast              -0.081299   0.064598  -1.259   0.2091    
## RegionSouth             -0.115977   0.073213  -1.584   0.1142    
## CategoryOffice Supplies  0.094066   0.072365   1.300   0.1946    
## CategoryFurniture        0.008236   0.075943   0.108   0.9137    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4299 on 310 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1914, Adjusted R-squared:  0.1705 
## F-statistic: 9.169 on 8 and 310 DF,  p-value: 2.475e-11
# NOTE(review): fit2 is reassigned here, discarding the polynomial fit above.
# Everything downstream (metrics, PRESS, autoplot) uses THIS simpler model.
fit2<-lm(profit_kare ~ sales_log +Region, data = tra)
summary(fit2)
## 
## Call:
## lm(formula = profit_kare ~ sales_log + Region, data = tra)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3737 -0.1003 -0.0238  0.0725  5.4821 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.50052    0.07808   6.410 5.33e-10 ***
## sales_log     -0.17862    0.03451  -5.176 4.05e-07 ***
## RegionCentral -0.15938    0.06958  -2.291   0.0226 *  
## RegionEast    -0.04076    0.06732  -0.605   0.5453    
## RegionSouth   -0.07085    0.07623  -0.929   0.3534    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4526 on 314 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.09206,    Adjusted R-squared:  0.0805 
## F-statistic:  7.96 on 4 and 314 DF,  p-value: 4.015e-06
# Collect train-set metrics (RMSE/R^2/MAE) for fit2 as a labeled one-row frame.
fit2_res<-as.data.frame(t(defaultSummary(data.frame(obs=tra$profit_kare,pred=predict(fit2,tra)))))
rownames(fit2_res)<-"fit2"
# fit3: cubic centered log-Sales, Region main effects, and a
# Region x log-Sales interaction.
fit3<-lm(profit_kare ~ sales_log_merkez + I(sales_log_merkez^2)+ I(sales_log_merkez^3)+Region+Region*sales_log_merkez , data = tra)
summary(fit3)
## 
## Call:
## lm(formula = profit_kare ~ sales_log_merkez + I(sales_log_merkez^2) + 
##     I(sales_log_merkez^3) + Region + Region * sales_log_merkez, 
##     data = tra)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7048 -0.0631 -0.0086  0.0559  4.5709 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.097684   0.049754   1.963 0.050505 .  
## sales_log_merkez               -0.077295   0.084982  -0.910 0.363770    
## I(sales_log_merkez^2)           0.237325   0.044240   5.364 1.59e-07 ***
## I(sales_log_merkez^3)          -0.162955   0.043256  -3.767 0.000198 ***
## RegionCentral                  -0.159020   0.066007  -2.409 0.016575 *  
## RegionEast                     -0.085244   0.064469  -1.322 0.187062    
## RegionSouth                    -0.110497   0.072848  -1.517 0.130335    
## sales_log_merkez:RegionCentral  0.162768   0.091109   1.787 0.074993 .  
## sales_log_merkez:RegionEast     0.008373   0.088434   0.095 0.924632    
## sales_log_merkez:RegionSouth    0.109622   0.100730   1.088 0.277324    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4291 on 309 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.197,  Adjusted R-squared:  0.1736 
## F-statistic: 8.424 on 9 and 309 DF,  p-value: 2.919e-11
fit3_res<-as.data.frame(t(defaultSummary(data.frame(obs=tra$profit_kare,pred=predict(fit3,tra)))))
rownames(fit3_res)<-"fit3"
# fit4: raw Sales plus Region and Category on the transformed response.
fit4<-lm(profit_kare ~ Sales+Region+Category, data = tra)
summary(fit4)
## 
## Call:
## lm(formula = profit_kare ~ Sales + Region + Category, data = tra)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3424 -0.1397 -0.0331  0.0498  5.6699 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)   
## (Intercept)              8.226e-02  7.495e-02   1.098  0.27323   
## Sales                   -5.786e-05  5.163e-05  -1.121  0.26332   
## RegionCentral           -1.562e-01  7.123e-02  -2.193  0.02905 * 
## RegionEast              -2.747e-02  6.901e-02  -0.398  0.69085   
## RegionSouth             -7.390e-02  7.831e-02  -0.944  0.34608   
## CategoryOffice Supplies  1.855e-01  6.849e-02   2.709  0.00712 **
## CategoryFurniture        2.525e-02  8.171e-02   0.309  0.75751   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4629 on 312 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.05642,    Adjusted R-squared:  0.03828 
## F-statistic: 3.109 on 6 and 312 DF,  p-value: 0.005669
# NOTE(review): like fit2 above, fit4 is reassigned, discarding the model
# with Category; downstream code uses this Sales + Region version.
fit4<-lm(profit_kare ~ Sales+Region, data = tra)
summary(fit4)
## 
## Call:
## lm(formula = profit_kare ~ Sales + Region, data = tra)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3295 -0.1303 -0.0633  0.0307  5.7403 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.075e-01  5.147e-02   4.032 6.96e-05 ***
## Sales         -9.562e-05  5.091e-05  -1.878   0.0613 .  
## RegionCentral -1.528e-01  7.208e-02  -2.120   0.0348 *  
## RegionEast    -3.744e-02  6.983e-02  -0.536   0.5922    
## RegionSouth   -7.089e-02  7.912e-02  -0.896   0.3710    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4689 on 314 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.02553,    Adjusted R-squared:  0.01312 
## F-statistic: 2.057 on 4 and 314 DF,  p-value: 0.08632
fit4_res<-as.data.frame(t(defaultSummary(data.frame(obs=tra$profit_kare,pred=predict(fit4,tra)))))
rownames(fit4_res)<-"fit4"
# fit5: simplest benchmark — transformed profit on raw Sales only.
fit5<-lm(profit_kare ~ Sales, data = tra)
summary(fit5)
## 
## Call:
## lm(formula = profit_kare ~ Sales, data = tra)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3754 -0.1110 -0.0683  0.0329  5.7652 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.451e-01  2.879e-02   5.040 7.84e-07 ***
## Sales       -9.339e-05  5.092e-05  -1.834   0.0676 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4703 on 317 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.0105, Adjusted R-squared:  0.007379 
## F-statistic: 3.364 on 1 and 317 DF,  p-value: 0.06758
fit5_res<-as.data.frame(t(defaultSummary(data.frame(obs=tra$profit_kare,pred=predict(fit5,tra)))))
rownames(fit5_res)<-"fit5"
# Hold-out test-set metrics for each candidate model, combined into one
# comparison table (fit2 has the lowest RMSE and highest R^2).
fit2_res_test<-as.data.frame(t(defaultSummary(data.frame(obs=test$profit_kare,pred=predict(fit2,test)))))
rownames(fit2_res_test)<-"fit2"
fit3_res_test<-as.data.frame(t(defaultSummary(data.frame(obs=test$profit_kare,pred=predict(fit3,test)))))
rownames(fit3_res_test)<-"fit3"
fit4_res_test<-as.data.frame(t(defaultSummary(data.frame(obs=test$profit_kare,pred=predict(fit4,test)))))
rownames(fit4_res_test)<-"fit4"
fit5_res_test<-as.data.frame(t(defaultSummary(data.frame(obs=test$profit_kare,pred=predict(fit5,test)))))
rownames(fit5_res_test)<-"fit5"
round(rbind(fit2_res_test,fit3_res_test,fit4_res_test,fit5_res_test),2)
##      RMSE Rsquared  MAE
## fit2 0.18     0.18 0.13
## fit3 0.24     0.09 0.14
## fit4 0.21     0.04 0.15
## fit5 0.22     0.01 0.15
# Candidate models for the PRESS loop below.
list2<-list(fit2,fit3,fit4,fit5)
# Predicted residual sum of squares (PRESS) for a fitted lm: leave-one-out
# residuals computed via the hat-matrix shortcut e_i / (1 - h_ii), squared
# and summed. Lower PRESS indicates better out-of-sample fit.
PRESS <- function(linmodel) {
  hat_diag <- lm.influence(linmodel)$hat
  loo_resid <- residuals(linmodel) / (1 - hat_diag)
  sum(loo_resid ^ 2)
}
# Print the PRESS statistic for each candidate (order: fit2, fit3, fit4, fit5).
for (i in list2) {
  print(paste("Press:",round(PRESS(i),3)))
}
## [1] "Press: 67.076"
## [1] "Press: 69.091"
## [1] "Press: 70.856"
## [1] "Press: 70.733"
library(ggfortify)
# Diagnostic plots for fit2, the model with the lowest PRESS above.
autoplot(fit2)

library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
# Regression tree for the transformed profit; variable.importance ranks the
# predictors (log-Sales dominates, then Region, then Category).
cart<-rpart(profit_kare~sales_log+Region+Category , data=tra)
cart$variable.importance
##  sales_log     Region   Category 
## 13.5219358  1.5659918  0.6717293
# Draw the fitted tree (see ?prp for the `type` label-style options).
prp(cart, type=5)

SONUÇ

Yaptığımız analizler sonucunda kârı artırmak için yüksek kâr getiren “Copiers” yani fotokopi makinelerine ağırlık verilmesi gerektiğini gözlemliyoruz. Özellikle bu ağırlığın merkez bölge için daha fazla özelleştirilmesi gerektiğini söyleyebiliriz. En büyük zararı ise Doğu bölgesinde “Machines” için gözlemledik, fakat zaman zaman makineler doğu bölgesinde kâr sağlamıştır. En yüksek zarara neden olan makinenin hangisi olduğunu tespit edip bunu düzeltmek üzerine yoğunlaşabiliriz. Ve yine Doğu bölgesinde “Supplies” değişkeninin sadece zarara uğrattığını gözlemleyebiliriz. Mobilyalar için ise “Tables” değişkeni Güney bölgesinde zarara uğratmıştır. “Bookcases” değişkeni ise yine Doğu bölgesinde zarar getirmiştir. Modellerden ise en az hatayı ve en yüksek R^2 değerini veren 2. modeli seçiyoruz.